Example #1
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          teacher,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    t = datetime.now().strftime('%H-%M')
    PATH = 'results/ddpg_{}'.format(t)

    #assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        agent.restore_model(PATH)
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)

                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            eval_action)
                        eval_env.background = get_q_background(
                            eval_env, agent.q, eval_action)

                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            agent.save_model(PATH, epoch)
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
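
Example 1 logs a scaling factor but then steps the environment with the raw action, and the symmetric-action assertion is left commented out. For reference, a minimal sketch of the scaling that the other examples apply inline as max_action * action, assuming a symmetric Box action space (this helper is not part of any example above):

import numpy as np

def scale_symmetric_action(env, action):
    """Rescale a policy output in [-1, 1] to the env's action range.

    Sketch only: assumes a Box action space with low == -high, which is
    what the commented-out assertion in Example 1 checks. Examples 3-6
    multiply directly, relying on the policy output staying in [-1, 1];
    the clip here just makes the sketch safe for out-of-range inputs.
    """
    assert (np.abs(env.action_space.low) == env.action_space.high).all()
    max_action = env.action_space.high
    return max_action * np.clip(action, -1.0, 1.0)

With this helper the rollout step would read env.step(scale_symmetric_action(env, action)), which is what Examples 3 through 6 do inline.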
Example #2
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          restore=True):
    rank = MPI.COMM_WORLD.Get_rank()

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape, (env.action_space.shape[0], ),
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 observation_range=(env.observation_space.low[0],
                                    env.observation_space.high[0]),
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up saving stuff only for a single worker.
    savingModelPath = "/home/joel/Documents/saved_models_OpenAI_gym/"
    if rank == 0:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=1)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.

        # from https://github.com/openai/baselines/issues/162#issuecomment-397356482 and
        # https://www.tensorflow.org/api_docs/python/tf/train/import_meta_graph

        if restore:
            # restoring doesn't actually work
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(savingModelPath +
                                               "ddpg_test_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(savingModelPath))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer()
                     )  # this should happen here and not in the agent right?

        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        t_rollout = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time_epoch = time.time()
            for cycle in range(nb_epoch_cycles):
                start_time_cycle = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # while(not done):
                    start_time_rollout = time.time()
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    logging.debug("q-value of selected action: {}".format(q))

                    # np.set_printoptions(precision=3)
                    logging.debug(
                        "selected (unscaled) action: " +
                        str(action))  # e.g. [ 0.04  -0.662 -0.538  0.324]
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    target = scale_range(action, -1, 1, env.action_space.low,
                                         env.action_space.high)
                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert target.shape == env.action_space.shape
                    new_obs, r, done, info = env.step(target)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done or t_rollout >= nb_rollout_steps - 1:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()
                    # t_rollout += 1
                    logger.info(
                        'runtime rollout-step {0}.{1}.{2}: {3}s'.format(
                            epoch, cycle, t_rollout,
                            time.time() - start_time_rollout))
                # for rollout_steps

                # Train.
                logging.info("Training the Agent")
                start_time_train = time.time()
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):  # 50 iterations
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise(
                        )  # e.g. 0.7446093559265137
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    logging.debug(
                        "critic loss: {}".format(cl))  # e.g. 25.988863
                    logging.debug(
                        "actor loss: {}".format(al))  # e.g. -0.008966461

                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_target = scale_range(eval_action, -1, 1,
                                                  eval_env.action_space.low,
                                                  eval_env.action_space.high)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            eval_target
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.
                logger.info('runtime training actor & critic: {}s'.format(
                    time.time() - start_time_train))

                # Saving the trained model
                if (saver is not None):
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    saver.save(sess, savingModelPath + "ddpg_test_model")
                    logger.info('runtime saving: {}s'.format(time.time() -
                                                             start_time_save))

                done = False

                logger.info('runtime epoch-cycle {0}: {1}s'.format(
                    cycle,
                    time.time() - start_time_cycle))
            # for epoch_cycles

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
                logging.info("\t{0} : {1}".format(key, combined_stats[key]))
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Saving the trained model
            if (saver is not None):
                logger.info("saving the trained model")
                start_time_save = time.time()
                saver.save(sess,
                           savingModelPath + "ddpg_model_epochSave",
                           global_step=epoch)
                logger.info('runtime saving: {}s'.format(time.time() -
                                                         start_time_save))

            logger.info('runtime epoch {0}: {1}s'.format(
                epoch,
                time.time() - start_time_epoch))
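
Example 2 rescales the raw policy output with a scale_range helper whose definition is not shown. A plausible implementation is a plain linear map between ranges; this is an assumption about the helper, not the author's code:

import numpy as np

def scale_range(x, old_low, old_high, new_low, new_high):
    """Linearly map x from [old_low, old_high] to [new_low, new_high].

    Assumed implementation of the scale_range helper used above; it works
    element-wise, so new_low/new_high can be env.action_space.low/high.
    """
    x = np.clip(x, old_low, old_high)
    return new_low + (x - old_low) * (new_high - new_low) / (old_high - old_low)

For example, with the old range [-1, 1], scale_range(0.0, -1, 1, low, high) returns the midpoint (low + high) / 2 of the action range.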
Example #3
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          load_network_id,
          latest,
          plot_info,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        if (load_network_id):
            agent.load_actor_critic(id=load_network_id)
        if (latest):
            agent.load_actor_critic(latest=True)

        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_distances2target = []
        epoch_episode_relative_alt = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        logger.info('EPISODE OVER!')
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        epoch_episode_distances2target.append(
                            info['dist2target'])
                        epoch_episode_relative_alt.append(info['relative_alt'])
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        if (episodes % 10 == 0):
                            agent.save_actor_critic(id=episodes)

                        if (episodes % 2 == 0 and plot_info):
                            plot_information(epoch_episode_distances2target,
                                             epoch_episode_rewards,
                                             epoch_episode_relative_alt)
                            plt.pause(0.1)

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Update learning rates
            if (epoch % 5 == 0 and epoch > 0):
                agent.update_lr(agent.actor_lr * 0.65, agent.critic_lr * 0.65)

    logger.info('Finished training')
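
The schedule at the end of Example 3 multiplies both learning rates by 0.65 every 5 epochs. A standalone sketch of the resulting rate (the agent's update_lr method itself is not shown here):

def decayed_lr(initial_lr, epoch, factor=0.65, every=5):
    """Learning rate in effect after `epoch` epochs under Example 3's schedule.

    Sketch only: the decay fires at epochs 5, 10, 15, ..., so e.g. an actor_lr
    of 1e-4 becomes 1e-4 * 0.65**2 = 4.225e-05 once epoch 10 has finished.
    """
    return initial_lr * factor ** (epoch // every)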
Example #4
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          perform=False,
          expert=None,
          save_networks=False,
          supervise=False,
          pre_epoch=60,
          actor_only=False,
          critic_only=False,
          both_ours_sup=False,
          gail=False,
          pofd=False):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 expert=expert,
                 save_networks=save_networks,
                 supervise=supervise,
                 actor_only=actor_only,
                 critic_only=critic_only,
                 both_ours_sup=both_ours_sup,
                 gail=gail,
                 pofd=pofd)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        network_saving_dir = os.path.join('./saved_networks',
                                          env.env.spec.id) + '/'
        if not os.path.exists(network_saving_dir):
            os.makedirs(network_saving_dir)
        agent.initialize(sess, saver, network_saving_dir, 10000, 30000)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        if expert is None:
            pretrain = False
        else:
            pretrain = True
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        small_buffer = []
        big_buffer = []
        for epoch in range(nb_epochs):
            if epoch >= pre_epoch and pretrain:
                pretrain = False
                logger.info('Stopped pretrain at epoch {}'.format(epoch))
            for cycle in range(nb_epoch_cycles):
                if not perform:
                    # Perform rollouts.
                    for t_rollout in range(nb_rollout_steps):
                        # Predict next action.
                        action, q = agent.pi(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                        assert action.shape == env.action_space.shape

                        # Execute next action.
                        if rank == 0 and render:
                            env.render()
                        assert max_action.shape == action.shape
                        new_obs, r, done, info = env.step(
                            max_action * action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                        t += 1
                        if rank == 0 and render:
                            env.render()
                        episode_reward += r
                        episode_step += 1

                        # Book-keeping.
                        epoch_actions.append(action)
                        epoch_qs.append(q)
                        agent.store_transition(obs, action, r, new_obs, done)
                        obs = new_obs

                        if done:
                            # Episode done.
                            epoch_episode_rewards.append(episode_reward)
                            episode_rewards_history.append(episode_reward)
                            epoch_episode_steps.append(episode_step)
                            episode_reward = 0.
                            episode_step = 0
                            epoch_episodes += 1
                            episodes += 1

                            agent.reset()
                            obs = env.reset()

                    # Train.
                    epoch_actor_losses = []
                    epoch_critic_losses = []
                    epoch_adaptive_distances = []
                    for t_train in range(nb_train_steps):
                        # Adapt param noise, if necessary.
                        if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                            distance = agent.adapt_param_noise()
                            epoch_adaptive_distances.append(distance)

                        cl, al = agent.train(pretrain)
                        epoch_critic_losses.append(cl)
                        epoch_actor_losses.append(al)
                        agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        old_eval_obs = eval_obs
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                        if perform:
                            small_buffer.append([
                                old_eval_obs, eval_action, eval_r, eval_obs,
                                eval_done
                            ])

                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

                            if perform and len(small_buffer) > 0:
                                big_buffer.append(small_buffer)
                                small_buffer = []
                                if len(big_buffer
                                       ) > 0 and len(big_buffer) % 1000 == 0:
                                    expert_dir = os.path.join(
                                        './expert', env.env.spec.id) + '/'
                                    if not os.path.exists(expert_dir):
                                        os.makedirs(expert_dir)
                                    pwritefile = open(
                                        os.path.join(expert_dir, 'expert.pkl'),
                                        'wb')
                                    pickle.dump(big_buffer, pwritefile, -1)
                                    pwritefile.close()
                                    logger.info('Expert data saved!')
                                    return

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            combined_stats = {}
            if not perform:
                stats = agent.get_stats()
                for key in sorted(stats.keys()):
                    combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            if not perform:
                combined_stats['rollout/return'] = mpi_mean(
                    epoch_episode_rewards)
                combined_stats['rollout/return_history'] = mpi_mean(
                    np.mean(episode_rewards_history))
                combined_stats['rollout/episode_steps'] = mpi_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = mpi_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
                combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

                # Train statistics.
                combined_stats['train/loss_actor'] = mpi_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = mpi_mean(
                    epoch_critic_losses)
                combined_stats['train/param_noise_distance'] = mpi_mean(
                    epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))
            if not perform:
                # Total statistics.
                combined_stats['total/duration'] = mpi_mean(duration)
                combined_stats['total/steps_per_second'] = mpi_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = mpi_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
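
Example 4's perform mode pickles the collected evaluation trajectories to ./expert/<env id>/expert.pkl. A minimal sketch for loading that file back, assuming the layout written by the code above:

import os
import pickle

def load_expert_trajectories(env_id, expert_root='./expert'):
    """Load the buffer written by Example 4's perform mode.

    Sketch only: returns a list of episodes, each a list of
    [obs, action, reward, next_obs, done] entries, matching the dump above.
    """
    with open(os.path.join(expert_root, env_id, 'expert.pkl'), 'rb') as f:
        return pickle.load(f)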
Example #5
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          overwrite_memory,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          logdir,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          eval_jump,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          agentName=None,
          resume=0,
          max_to_keep=100):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver(max_to_keep=max_to_keep)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    logF = open(os.path.join(logdir, 'log.txt'), 'a')
    logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a')
    logReward = open(os.path.join(logdir, 'logReward.txt'), 'a')

    with U.single_threaded_session() as sess:
        # Prepare everything.
        if (resume == 0):
            agent.initialize(sess, max_to_keep=max_to_keep)
        else:
            #restore = "{}-{}".format(agentName,resume)
            agent.initialize(sess,
                             path=os.path.abspath(logdir),
                             restore=agentName,
                             itr=resume,
                             overwrite=overwrite_memory,
                             max_to_keep=max_to_keep)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(resume, resume + nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        print("Epoch " + str(epoch) + " episodes " +
                              str(episodes) + " steps " + str(episode_step) +
                              " reward " + str(episode_reward))
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None and epoch % eval_jump == 0:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            print("Eval reward " + str(eval_episode_reward))
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None and epoch % eval_jump == 0:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            #    logdir = logger.get_dir()
            if rank == 0:
                logReward.write(
                    str(epoch) + "," + str(combined_stats["rollout/return"]) +
                    "\n")
                logReward.flush()
                logF.write(str(combined_stats["rollout/return"]) + "\n")
                json.dump(combined_stats, logStats)
                logF.flush()
                logStats.flush()

                #    if not os.path.exists(os.path.abspath(logdir)):
                #        os.makedirs(os.path.abspath(logdir), exist_ok=True)
                #    print("logdir = ", logdir)
                #    with open(os.path.join(logdir, "{}_{}".format(agentName, agent.itr.eval())), 'wb') as f:
                #        pickle.dump(agent, f)

                agent.save(path=logdir,
                           name=agentName,
                           overwrite=overwrite_memory)
                logger.info("agent {} saved".format(agent.itr.eval()))
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
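
Example 5 appends one 'epoch,rollout-return' pair per epoch to logReward.txt. A small sketch for reading that file back, assuming the comma-separated format written above:

import numpy as np

def read_reward_log(path):
    """Parse the 'epoch,return' lines written by Example 5 into two arrays.

    Sketch only; assumes every line looks like '<int epoch>,<float return>'.
    """
    epochs, returns = [], []
    with open(path) as f:
        for line in f:
            epoch_str, return_str = line.strip().split(',')
            epochs.append(int(epoch_str))
            returns.append(float(return_str))
    return np.array(epochs), np.array(returns)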
Example #6
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          save_path=None,
          restore_path=None,
          hindsight_mode=None):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()

        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                transitions = []
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    transitions.append((obs, action, r, new_obs, done))
                    #agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # store regular transitions into replay memory
                # (use fresh loop variable names so the rollout's current
                # `obs`, `action`, `r`, `new_obs`, `done` are not clobbered)
                for (obs_t, act_t, r_t, new_obs_t, done_t) in transitions:
                    agent.store_transition(obs_t, act_t, r_t, new_obs_t, done_t)

                if hindsight_mode in ['final', 'future']:
                    for (obs_t, act_t, r_t, new_obs_t,
                         done_t) in replay_final(transitions, env.env):
                        agent.store_transition(obs_t, act_t, r_t, new_obs_t,
                                               done_t)

                if hindsight_mode in ['future']:
                    for (obs_t, act_t, r_t, new_obs_t,
                         done_t) in replay_future(transitions, env.env):
                        agent.store_transition(obs_t, act_t, r_t, new_obs_t,
                                               done_t)

                # store hindsight transitions.
                '''for i in range(3):
                    # sample a random point in the trajectory
                    idx = np.random.randint(0, len(transitions))
                    obs, action, r, new_obs, done = transitions[idx]
                    # create a goal from that point
                    goal = env.env.obs_to_goal(new_obs)
                    for (obs, action, r, new_obs, done) in replay_with_goal(transitions[:idx+1], goal, env.env):
                        agent.store_transition(obs, action, r, new_obs, done)
                obs, action, r, new_obs, done = transitions[-1]

                # store a "final" transition.
                goal = env.env.obs_to_goal(new_obs)
                for (obs, action, r, new_obs, done) in replay_with_goal(transitions, goal, env.env):
                    agent.store_transition(obs, action, r, new_obs, done)'''

                # Train.

                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['reward'] = mpi_mean(epoch_episode_rewards)
            # combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
            combined_stats['episode_steps'] = mpi_mean(epoch_episode_steps)
            combined_stats['episodes'] = mpi_sum(epoch_episodes)
            # combined_stats['actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['actions_std'] = mpi_std(epoch_actions)
            combined_stats['Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['policy_loss'] = mpi_mean(epoch_actor_losses)
            combined_stats['value_loss'] = mpi_mean(epoch_critic_losses)
            combined_stats['param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/reward'] = mpi_mean(eval_episode_rewards)
                # combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q_mean'] = mpi_mean(eval_qs)
                # combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))

            # Total statistics.
            # combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            # combined_stats['total/episodes'] = mpi_mean(episodes)
            # combined_stats['total/epochs'] = epoch + 1
            # combined_stats['total/steps'] = t
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
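The hindsight branch above calls two helpers, replay_final and replay_future, that are not part of the listing. The sketch below shows one plausible shape for them, following the HER 'final' and 'future' relabeling strategies; the goal_env methods obs_to_goal() and compute_reward() are assumptions made for illustration (the commented-out block hints at an obs_to_goal helper), not the confirmed API of this example's environment wrapper.

# Minimal sketch of the hindsight-replay helpers used above, under the
# assumption that the wrapped env exposes obs_to_goal(obs) and
# compute_reward(obs, goal). These names are illustrative, not the source's.
import numpy as np

def replay_final(transitions, goal_env):
    """Relabel the trajectory with the goal reached at its final step (HER 'final')."""
    if not transitions:
        return []
    last_obs = transitions[-1][3]                   # new_obs of the last step
    goal = goal_env.obs_to_goal(last_obs)           # assumed env helper
    relabeled = []
    for (obs, action, _, new_obs, done) in transitions:
        r = goal_env.compute_reward(new_obs, goal)  # assumed reward recomputation
        relabeled.append((obs, action, r, new_obs, done))
    return relabeled

def replay_future(transitions, goal_env, k=4):
    """Relabel each step with goals sampled from later steps of the same trajectory (HER 'future')."""
    relabeled = []
    n = len(transitions)
    for i, (obs, action, _, new_obs, done) in enumerate(transitions):
        future_idx = np.random.randint(i, n, size=min(k, n - i))
        for j in future_idx:
            goal = goal_env.obs_to_goal(transitions[j][3])
            r = goal_env.compute_reward(new_obs, goal)
            relabeled.append((obs, action, r, new_obs, done))
    return relabeled

With helpers of this shape, the calls replay_final(transitions, env.env) and replay_future(transitions, env.env) in the cycle above relabel each collected batch of transitions before it is pushed into the replay memory.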
Ejemplo n.º 7
0
def train(env_id, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50, my_render=True, eg_explore=True, reward_param_scaling=1.0,
    reward_param_thr=70, reward_param_type='const'):
    
    # save data
    ####################################
    full_path = txt_path + '_etc_RL.txt'
    file = open(full_path, 'w')
    print('Start training for env: ' + env_id)
    # change to your dir of choice for saving
    save_path = os.getcwd()
    print('Save data at ' + save_path + '. Change to your desired path.')

    
    dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.pkl'
    append_num = 0
    while os.path.exists(os.path.join(save_path,dump_name)):
        dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.'+str(append_num)+'.pkl'
        append_num+=1

    rank = MPI.COMM_WORLD.Get_rank()
    print('second rank is', rank)
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver(max_to_keep = 1)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        
        
        epoch = 0
        start_time = time.time()

        epoch_com_sav = []

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        count_fwj = 0  # counts how many env.step() calls have been made
        for epoch in range(nb_epochs):
            print(nb_epochs)
            # collect data for saving plot
            save_data = {'act': [],
                         'obs': [],
                         'qpos':[],
                         'rew':[], # reward for this episode
                         'freq_com':[], # communication frequency
                         'act_ts': [],
                         'obs_ts': [],
                         'qpos_ts': [],
                         'rew_ts': [],  # reward for this episode
                         'freq_com_ts': [],  # communication frequency
                         'comm_r_factor':reward_param_scaling,
                         'eplen_ts':[] # len of test episodes
                         }

            # decay the exploration
            e_greed = 0.5 - 0.1 * np.log10( (t%10000) + 1)
            explore_switch = (t < 20000 and eg_explore and e_greed > 0)
            print('total steps: '+str(t)+', eps greedy rate: '+str(e_greed)+', explore is '+str(explore_switch))
            
            for cycle in range(nb_epoch_cycles):
                
                # Perform rollouts.

                # init u_old, don't forget to change test also
                u_old = 1.0 * env.action_space.sample() / max_action

                num_no_com = 0
                for t_rollout in range(nb_rollout_steps):

                    count_fwj += 1
                    print('env steps played so far:', count_fwj)
                    # Predict next action.
                    # edit this to be param version
                    if len(obs) != 6:
                        obs.append(obs[2] - obs[0])
                        obs.append(obs[3] - obs[1])
                        a_raw, q = agent.pi(np.concatenate([obs, u_old], axis=0), apply_noise=False, compute_Q=True)

                    else:
                        a_raw, q = agent.pi(np.concatenate([obs, u_old], axis=0), apply_noise=False, compute_Q=True)
                        print('value of q is', q)
                        if count_fwj % 50000 == 0 and saver is not None:
                            saver.save(sess, './home/test_3_ar.ckpt', global_step=1)
                    a0 = a_raw[0]
                    a1 = a_raw[1]

                    # eps greedy, flip the coin
                    # make eps decay first 10k updates
                    dice_greed = np.random.uniform()
                    if explore_switch and dice_greed < e_greed:
                        com = ( np.random.uniform() > 0.5 )
                    else:
                        com = (a0 > a1)

                    # action according to com switch
                    if com:
                        r_com = 0.0
                        action = np.copy(a_raw[2:]) #No communication
                        num_no_com += 1             #No communication
                    else:
                        if reward_param_type=='const':
                            r_com = 1. # const reward
                        elif reward_param_type=='linear':
                            r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com) # linear interp reward
                        elif reward_param_type=='inv':
                            r_com = 1.0 / (1.0 + (np.maximum(num_no_com - reward_param_thr, 0)))  # inv decay reward
                        else:
                            raise ValueError('unknown reward_param_type: ' + reward_param_type)

                        r_com = reward_param_scaling * r_com
                        #action = np.copy(u_old)
                        action = np.copy(a_raw[2:])
                        num_no_com += 1

                    assert action.shape == env.action_space.shape

                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(action) 

                    print(done)

                    file.write(str(new_obs)+',q_value_is,'+str(q)+',step_reward,'+str(r)+',action used,' + str(max_action*action)+'\n')


                    t += 1
                    if rank == 0 and render:
                        pass

                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(a_raw)
                    epoch_qs.append(q)

                    agent.store_transition(np.concatenate([obs,u_old],axis=0), a_raw, r+r_com, np.concatenate([np.squeeze(new_obs), action],axis=0) , done)
                    obs = np.squeeze(new_obs)

                    save_data['act'].append(np.array(action))
                    save_data['obs'].append(np.array(obs))
                    if hasattr(env.unwrapped, 'data'):
                        save_data['qpos'].append(np.array(env.unwrapped.data.qpos))

                    u_old = np.copy(action)

                    if done:


                        # Episode done.
                        epoch_com_sav.append(np.asarray(1.0*num_no_com/episode_step))

                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        print('one game finished, count is =================================', env.count)
                        file.write('count is,' + str(env.count) + '\n')
                        file.write('done is,' + str(done) + '\n')
                        file.write('long term reward is,' + str(env.long_term_reward) + '\n')
                        file.write('#' * 12 + 'one game finished\n')

                        agent.reset()
                        obs = env.reset()

                #end of loop nb_rollout
                print('communication savings: ' + str(num_no_com)) # check com number
                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()

            # log stuff
            save_data['rew'].append(np.mean(epoch_episode_rewards))
            save_data['freq_com'].append(np.mean(epoch_com_sav))

            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        ###===============================================
        # test the fully-trained agent
        env = env.unwrapped
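The rollout loop in this variant computes its communication bonus inline, which makes the three reward_param_type schedules hard to compare at a glance. Below is a sketch that factors the same arithmetic into a standalone helper; the parameter names mirror the listing, but the helper itself is an illustration, not part of the original code.

# Sketch: the communication reward used in the rollout above, pulled out into
# one function so the 'const', 'linear' and 'inv' schedules sit side by side.
import numpy as np

def communication_reward(num_no_com, nb_rollout_steps,
                         reward_param_type='const',
                         reward_param_thr=70,
                         reward_param_scaling=1.0):
    if reward_param_type == 'const':
        r_com = 1.0                                   # flat bonus
    elif reward_param_type == 'linear':
        r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (
            nb_rollout_steps - num_no_com)            # linearly interpolated bonus
    elif reward_param_type == 'inv':
        r_com = 1.0 / (1.0 + np.maximum(num_no_com - reward_param_thr, 0))  # inverse decay past the threshold
    else:
        raise ValueError('unknown reward_param_type: ' + reward_param_type)
    return reward_param_scaling * r_com

In the rollout it would be used as r_com = communication_reward(num_no_com, nb_rollout_steps, reward_param_type, reward_param_thr, reward_param_scaling), with the result added to the environment reward before agent.store_transition().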
Ejemplo n.º 8
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          **kwargs):

    # print("kwargs:",kwargs)

    rank = MPI.COMM_WORLD.Get_rank()
    print("rank:", rank)
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        # Prepare everything.
        # --------------- AMEND: For saving and restoring the model. added by xlv ------------------
        if kwargs.get('restore') and kwargs.get('restore_path') is not None:
            restore_path = kwargs['restore_path']
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(restore_path +
                                               "trained_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(restore_path))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer())
        # ----------------------------------------------------------------------------------------
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = eval_obs = env.reset()

        # if eval_env is not None:
        #     eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []

        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        # every 30 epochs plot statistics and save it.
        nb_epochs_unit = 30
        ddpg_rewards = []
        eval_ddpg_rewards = []

        ddpg_suc_percents = []
        eval_suc_percents = []

        # ---- AMEND: added by xlv to calculate success percent -----
        suc_num = 0
        episode_num = 0
        # -----------------------------------------------------------
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, suc, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        # --- AMEND: added by xlv to calculate success percent ---
                        episode_num += 1
                        if suc:
                            suc_num += 1
                        # -------------------------------------------------------
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                # eval_episode_rewards = []
                # eval_qs = []
                # if eval_env is not None:
                #     eval_episode_reward = 0.
                #     for t_rollout in range(nb_eval_steps):
                #         eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                #         eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                #         if render_eval:
                #             eval_env.render()
                #         eval_episode_reward += eval_r
                #
                #         eval_qs.append(eval_q)
                #         if eval_done:
                #             eval_obs = eval_env.reset()
                #             eval_episode_rewards.append(eval_episode_reward)
                #             eval_episode_rewards_history.append(eval_episode_reward)
                #             eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            # if eval_env is not None:
            #     combined_stats['eval/return'] = eval_episode_rewards
            #     combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            #     combined_stats['eval/Q'] = eval_qs
            #     combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # ------------------------------ plot statistics every nb_epochs_unit -----------------------------------
            ddpg_rewards.append(np.mean(episode_rewards_history))
            if (epoch + 1) % nb_epochs_unit == 0:
                ddpg_suc_percents.append(suc_num / episode_num)
                # ---------- Evaluate for 5 iters -----------------------
                nb_eval_epochs = 5
                nb_eval_epoch_cycles = 5
                eval_episode_num = 0
                eval_suc_num = 0

                eval_episode_reward = 0
                eval_episode_step = 0

                eval_epoch_episode_rewards = []
                eval_epoch_episode_steps = []
                for i_epoch in range(nb_eval_epochs):
                    logger.log(
                        "********** Start Evaluation. Iteration %i ************"
                        % i_epoch)
                    for i_cycle in range(nb_eval_epoch_cycles):
                        for t_rollout in range(nb_rollout_steps):
                            eval_action, eval_q = agent.pi(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                            assert eval_action.shape == env.action_space.shape
                            eval_obs, eval_r, eval_done, eval_suc, eval_info = env.step(
                                max_action * eval_action
                            )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                            eval_episode_reward += eval_r
                            eval_episode_step += 1
                            if eval_done:
                                eval_obs = env.reset()
                                eval_epoch_episode_rewards.append(
                                    eval_episode_reward)
                                eval_episode_rewards_history.append(
                                    eval_episode_reward)
                                eval_epoch_episode_steps.append(
                                    eval_episode_step)
                                eval_episode_reward = 0
                                eval_episode_step = 0

                                eval_episode_num += 1
                                if eval_suc:
                                    eval_suc_num += 1
                    logger.record_tabular(
                        "Eval_EpRewMean",
                        np.mean(eval_episode_rewards_history))
                    logger.record_tabular("Eval_EpNumUntilNow",
                                          eval_episode_num)
                    logger.record_tabular("Eval_EpNumSuc", eval_suc_num)
                    logger.record_tabular("Eval_EpSucPercent",
                                          eval_suc_num / eval_episode_num)
                    logger.dump_tabular()
                    eval_ddpg_rewards.append(
                        np.mean(eval_episode_rewards_history))
                eval_suc_percents.append(eval_suc_num / eval_episode_num)
                # ----------------------------------------------------------------------------------------------
                # --------------------- plotting and saving -------------------------
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    if epoch + 1 == nb_epochs:
                        saver.save(sess,
                                   kwargs['MODEL_DIR'] + "/trained_model")
                    else:
                        saver.save(
                            sess, kwargs['MODEL_DIR'] + "/iter_" + str(
                                (epoch + 1) // nb_epochs_unit))

                plot_performance(range(len(ddpg_rewards)),
                                 ddpg_rewards,
                                 ylabel=r'avg reward per DDPG learning step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'],
                                                      'ddpg_reward'),
                                 title='TRAIN')
                plot_performance(
                    range(len(ddpg_suc_percents)),
                    ddpg_suc_percents,
                    ylabel=
                    r'overall success percentage per algorithm step under DDPG',
                    xlabel='algorithm iteration',
                    figfile=os.path.join(kwargs['FIGURE_DIR'],
                                         'success_percent'),
                    title="TRAIN")

                plot_performance(range(len(eval_ddpg_rewards)),
                                 eval_ddpg_rewards,
                                 ylabel=r'avg reward per DDPG eval step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'],
                                                      'eval_ddpg_reward'),
                                 title='EVAL')
                plot_performance(
                    range(len(eval_suc_percents)),
                    eval_suc_percents,
                    ylabel=
                    r'overall eval success percentage per algorithm step under DDPG',
                    xlabel='algorithm iteration',
                    figfile=os.path.join(kwargs['FIGURE_DIR'],
                                         'eval_success_percent'),
                    title="EVAL")

                # save data which is accumulated UNTIL iter i
                with open(
                        kwargs['RESULT_DIR'] + '/ddpg_reward_' + 'iter_' + str(
                            (epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f2:
                    pickle.dump(ddpg_rewards, f2)
                with open(
                        kwargs['RESULT_DIR'] + '/success_percent_' + 'iter_' +
                        str((epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as fs:
                    pickle.dump(ddpg_suc_percents, fs)

                # save evaluation data accumulated until iter i
                with open(
                        kwargs['RESULT_DIR'] + '/eval_ddpg_reward_' + 'iter_' +
                        str((epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f_er:
                    pickle.dump(eval_ddpg_rewards, f_er)
                with open(
                        kwargs['RESULT_DIR'] + '/eval_success_percent_' +
                        'iter_' + str(
                            (epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f_es:
                    pickle.dump(eval_suc_percents, f_es)
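Several of these variants repeat the same MPI bookkeeping: convert every stat to a scalar, allreduce the vector, and divide by the world size. The sketch below collects that pattern into one helper; it is illustrative only and assumes, like the inline as_scalar(), that every stat value is a Python scalar or a size-1 ndarray.

# Sketch: MPI averaging of the combined_stats dict, mirroring the inline
# as_scalar/allreduce blocks in the listings above.
import numpy as np
from mpi4py import MPI

def mpi_average_stats(combined_stats, comm=MPI.COMM_WORLD):
    def as_scalar(x):
        if isinstance(x, np.ndarray):
            assert x.size == 1
            return x[0]
        elif np.isscalar(x):
            return x
        raise ValueError('expected scalar, got %s' % x)

    keys = sorted(combined_stats.keys())   # fixed ordering on every rank
    local = np.array([as_scalar(combined_stats[k]) for k in keys])
    sums = comm.allreduce(local)            # element-wise sum over all workers
    return {k: v / comm.Get_size() for k, v in zip(keys, sums)}

A call such as combined_stats = mpi_average_stats(combined_stats) would replace the inline block; sorting the keys keeps the ordering identical on every rank, which the zip over dict keys in the listings implicitly relies on.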
Ejemplo n.º 9
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          saved_model_basename,
          restore_model_name,
          crowdai_client,
          crowdai_token,
          reward_shaping,
          feature_embellishment,
          relative_x_pos,
          relative_z_pos,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saved_model_dir = 'saved-models/'
    if saved_model_basename is None:
        saved_model_basename = ''.join(
            random.choices(string.ascii_lowercase + string.digits, k=8))
    saved_model_path = saved_model_dir + saved_model_basename
    if restore_model_name:
        restore_model_path = restore_model_name
        if not pathlib.Path(restore_model_path + '.index').is_file():
            restore_model_path = saved_model_dir + restore_model_name
    max_to_keep = 500
    eval_reward_threshold_to_keep = 300
    saver = tf.train.Saver(max_to_keep=max_to_keep)
    adam_optimizer_store = dict()
    adam_optimizer_store['actor_optimizer'] = dict()
    adam_optimizer_store['critic_optimizer'] = dict()

    #eval_episode_rewards_history = deque(maxlen=100)
    #episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        try:
            if restore_model_name:
                logger.info("Restoring from model at", restore_model_path)
                #saver.restore(sess, tf.train.latest_checkpoint(model_path))
                saver.restore(sess, restore_model_path)
            else:
                logger.info("Creating new model")
                sess.run(tf.global_variables_initializer(
                ))  # this should happen here and not in the agent right?
        except InvalidArgumentError as exc:
            if "Assign requires shapes of both tensors to match." in str(exc):
                print("Unable to restore model from {:s}.".format(
                    restore_model_path))
                print(
                    "Chances are you're trying to restore a model with reward embellishment into an environment without reward embellishment (or vice versa). Unfortunately this isn't supported (yet)."
                )
                print(exc.message)
                sys.exit()
            else:
                raise exc

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()

        # restore adam optimizer
        try:
            if restore_model_name:
                logger.info("Restoring pkl file with adam state",
                            restore_model_path)
                #saver.restore(sess, tf.train.latest_checkpoint(model_path))
                adam_optimizer_store = pickle.load(
                    open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store[
                    'actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store[
                    'actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store[
                    'actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store[
                    'critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store[
                    'critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store[
                    'critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']
        except Exception:
            print("Unable to restore adam state from {:s}.".format(
                restore_model_path))

        obs = env.reset()
        done = False
        episode_reward = 0.
        #episode_step = 0
        #episodes = 0
        #t = 0

        #epoch_episode_steps = []
        #epoch_episode_eval_rewards = []
        #epoch_episode_eval_steps = []
        #epoch_start_time = time.time()
        #epoch_actions = []
        #epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time = time.time()
            epoch_episode_rewards = []
            epoch_qs = []
            eval_episode_rewards = []
            eval_qs = []
            eval_steps = []
            epoch_actor_losses = []
            epoch_critic_losses = []
            worth_keeping = False
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    #new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = env.step(action)
                    #t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    #episode_step += 1

                    # Book-keeping.
                    #epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        #episode_rewards_history.append(episode_reward)
                        #epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        #episode_step = 0
                        #epoch_episodes += 1
                        #episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                #epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        #epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Submit to crowdai competition. What a hack. :)
                #if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
                crowdai_submit_count = 0
                if crowdai_client is not None and crowdai_token is not None:
                    eval_obs_dict = crowdai_client.env_create(
                        crowdai_token, env_id="ProstheticsEnv")
                    eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                        eval_obs_dict,
                        reward_shaping=reward_shaping,
                        reward_shaping_x=1.,
                        feature_embellishment=feature_embellishment,
                        relative_x_pos=relative_x_pos,
                        relative_z_pos=relative_z_pos)
                    while True:
                        action, _ = agent.pi(eval_obs_projection,
                                             apply_noise=False,
                                             compute_Q=False)
                        submit_action = prosthetics_env.openai_to_crowdai_submit_action(
                            action)
                        clipped_submit_action = np.clip(submit_action, 0., 1.)
                        actions_equal = clipped_submit_action == submit_action
                        if not np.all(actions_equal):
                            logger.debug("crowdai_submit_count:",
                                         crowdai_submit_count)
                            logger.debug("  openai-action:", action)
                            logger.debug("  submit-action:", submit_action)
                        crowdai_submit_count += 1
                        [eval_obs_dict, reward, done,
                         info] = crowdai_client.env_step(
                             clipped_submit_action.tolist(), True)
                        #[eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
                        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                            eval_obs_dict,
                            reward_shaping=reward_shaping,
                            reward_shaping_x=1.,
                            feature_embellishment=feature_embellishment,
                            relative_x_pos=relative_x_pos,
                            relative_z_pos=relative_z_pos)
                        if done:
                            logger.debug("done: crowdai_submit_count:",
                                         crowdai_submit_count)
                            eval_obs_dict = crowdai_client.env_reset()
                            if not eval_obs_dict:
                                break
                            logger.debug(
                                "done: eval_obs_dict exists after reset")
                            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                                eval_obs_dict,
                                reward_shaping=reward_shaping,
                                reward_shaping_x=1.,
                                feature_embellishment=feature_embellishment,
                                relative_x_pos=relative_x_pos,
                                relative_z_pos=relative_z_pos)
                    crowdai_client.submit()
                    return  # kids, don't try any of these (expedient hacks) at home!

            if eval_env:
                eval_episode_reward_mean, eval_q_mean, eval_step_mean = evaluate_n_episodes(
                    3, eval_env, agent, nb_eval_steps, render_eval)
                if eval_episode_reward_mean >= eval_reward_threshold_to_keep:
                    worth_keeping = True

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            if nb_epochs and nb_epoch_cycles and nb_train_steps > 0:
                #stats = agent.get_stats()
                #combined_stats = stats.copy()
                combined_stats = {}
                combined_stats['train/epoch_episode_reward_mean'] = np.mean(
                    epoch_episode_rewards)
                #combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
                #combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
                #combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
                combined_stats['train/epoch_Q_mean'] = np.mean(epoch_qs)
                combined_stats['train/epoch_loss_actor'] = np.mean(
                    epoch_actor_losses)
                combined_stats['train/epoch_loss_critic'] = np.mean(
                    epoch_critic_losses)
                #combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
                combined_stats['train/epoch_duration'] = duration
                #combined_stats['epoch/steps_per_second'] = float(t) / float(duration)
                #combined_stats['total/episodes'] = episodes
                #combined_stats['rollout/episodes'] = epoch_episodes
                #combined_stats['rollout/actions_std'] = np.std(epoch_actions)
                #combined_stats['memory/rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            else:
                combined_stats = {}
            # Evaluation statistics.
            if eval_env:
                combined_stats[
                    'eval/epoch_episode_reward_mean'] = eval_episode_reward_mean  # np.mean(eval_episode_rewards)
                #combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                #combined_stats['eval/epoch_episode_reward_std'] = np.std(eval_episode_rewards)
                combined_stats[
                    'eval/epoch_Q_mean'] = eval_q_mean  # np.mean(eval_qs)
                #combined_stats['eval/episodes'] = len(eval_episode_rewards)
                combined_stats[
                    'eval/steps_mean'] = eval_step_mean  # np.mean(eval_steps)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            #combined_stats['total/epochs'] = epoch + 1
            #combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.info('')
            logger.info('Epoch', epoch)
            logger.dump_tabular()
            logdir = logger.get_dir()

            if worth_keeping and rank == 0 and nb_epochs and nb_epoch_cycles and nb_train_steps and nb_rollout_steps:
                logger.info(
                    'Saving model to',
                    saved_model_dir + saved_model_basename + '-' + str(epoch))
                saver.save(sess,
                           saved_model_path,
                           global_step=epoch,
                           write_meta_graph=False)
                adam_optimizer_store['actor_optimizer'][
                    'm'] = agent.actor_optimizer.m
                adam_optimizer_store['actor_optimizer'][
                    'v'] = agent.actor_optimizer.v
                adam_optimizer_store['actor_optimizer'][
                    't'] = agent.actor_optimizer.t

                adam_optimizer_store['critic_optimizer'][
                    'm'] = agent.critic_optimizer.m
                adam_optimizer_store['critic_optimizer'][
                    'v'] = agent.critic_optimizer.v
                adam_optimizer_store['critic_optimizer'][
                    't'] = agent.critic_optimizer.t

                adam_optimizer_store['param_noise'] = agent.param_noise

                pickle.dump(
                    adam_optimizer_store,
                    open((saved_model_path + "-" + str(epoch) + ".pkl"), "wb"))
                old_epoch = epoch - max_to_keep
                if old_epoch >= 0:
                    try:
                        os.remove(saved_model_path + "-" + str(old_epoch) +
                                  ".pkl")
                    except OSError:
                        pass

            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
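The checkpointing in this variant pickles the MpiAdam moments separately because tf.train.Saver only covers graph variables, not the optimizer's numpy-side state. The helpers below sketch that save/restore logic in one place; the attribute names (actor_optimizer.m/v/t and so on) follow the listing, but the functions are an illustration rather than the original code.

# Sketch: pickle-based persistence of the MpiAdam state handled inline above.
import pickle

def save_adam_state(agent, path):
    state = {
        'actor_optimizer': {'m': agent.actor_optimizer.m,
                            'v': agent.actor_optimizer.v,
                            't': agent.actor_optimizer.t},
        'critic_optimizer': {'m': agent.critic_optimizer.m,
                             'v': agent.critic_optimizer.v,
                             't': agent.critic_optimizer.t},
        'param_noise': agent.param_noise,
    }
    with open(path, 'wb') as f:
        pickle.dump(state, f)

def load_adam_state(agent, path):
    with open(path, 'rb') as f:
        state = pickle.load(f)
    for name in ('actor_optimizer', 'critic_optimizer'):
        opt = getattr(agent, name)
        opt.m = state[name]['m']
        opt.v = state[name]['v']
        opt.t = state[name]['t']
    if state.get('param_noise') is not None:
        agent.param_noise = state['param_noise']

Calling save_adam_state(agent, saved_model_path + '-' + str(epoch) + '.pkl') after saver.save(), and load_adam_state(agent, restore_model_path + '.pkl') after saver.restore(), would reproduce the behaviour of the inline blocks.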
Ejemplo n.º 10
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          callback=None,
          pretrained='none'):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Copy an env for evaluation
    env_eval = copy.deepcopy(env.env)

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # load pretrained agent if possible
        if pretrained == 'none':
            logger.info('Training from scratch...')
        else:
            logger.info('Loading pretrained model from {}'.format(pretrained))
            #assert os.path.exists(pretrained)
            saver.restore(sess, pretrained)

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        total_time = 0
        start_time = time.time()

        total_time_record = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        #epochxposdict = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1
                    total_time += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        total_time_record.append(total_time)
                        #epochxposdict.append(info['pos'][0])
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    # eval for one episode
                    eval_episode_reward = 0.0
                    eval_done = False
                    eval_obs = eval_env.reset()
                    while not eval_done:
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                """
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                """

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Call the callback
            if callback is not None:
                if callback(locals(),
                            globals()):  # callback returns a boolean value
                    break
        # Evaluate the policy on env to record trajs
        eval_rewards, eval_steps, trajs_obs, trajs_actions = evaluate(
            env_eval, agent=agent)
        if callback is not None:
            callback.final_call(locals(), globals())
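
The training loop above treats the callback as both a stopping criterion (it is called with locals() and globals(), and a truthy return breaks the epoch loop) and a finalizer (final_call runs after the closing evaluation). A minimal sketch of a compatible callback object, assuming only the local names used in the loop (episode_rewards_history, epoch); the class and threshold are illustrative, not part of the original code.

import numpy as np


class EarlyStopCallback(object):
    """Stop training once the recent mean episode return clears a threshold."""

    def __init__(self, reward_threshold=200.0):
        self.reward_threshold = reward_threshold

    def __call__(self, locals_, globals_):
        history = locals_.get('episode_rewards_history')
        # Returning True breaks out of the epoch loop in train().
        return bool(history) and np.mean(history) >= self.reward_threshold

    def final_call(self, locals_, globals_):
        print('Training stopped after epoch', locals_.get('epoch'))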
Example No. 11
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    """
    Runs the training of the Deep Deterministic Policy Gradient (DDPG) model

    DDPG: https://arxiv.org/pdf/1509.02971.pdf

    :param env: (Gym Environment) the environment
    :param nb_epochs: (int) the number of training epochs
    :param nb_epoch_cycles: (int) the number cycles within each epoch
    :param render_eval: (bool) enable rendering of the evaluation environment
    :param reward_scale: (float) the value the reward should be scaled by
    :param render: (bool) enable rendering of the environment
    :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None)
    :param actor: (TensorFlow Tensor) the actor model
    :param critic: (TensorFlow Tensor) the critic model
    :param normalize_returns: (bool) should the critic output be normalized
    :param normalize_observations: (bool) should the observation be normalized
    :param critic_l2_reg: (float) l2 regularizer coefficient
    :param actor_lr: (float) the actor learning rate
    :param critic_lr: (float) the critic learning rate
    :param action_noise: (ActionNoise) the action noise type (can be None)
    :param popart: (bool) enable pop-art normalization of the critic output
        (https://arxiv.org/pdf/1602.07714.pdf)
    :param gamma: (float) the discount rate
    :param clip_norm: (float) clip the gradients (disabled if None)
    :param nb_train_steps: (int) the number of training steps
    :param nb_rollout_steps: (int) the number of rollout steps
    :param nb_eval_steps: (int) the number of evaluation steps
    :param batch_size: (int) the size of the batch for learning the policy
    :param memory: (Memory) the replay buffer
    :param tau: (float) the soft update coefficient (keep old values, between 0 and 1)
    :param eval_env: (Gym Environment) the evaluation environment (can be None)
    :param param_noise_adaption_interval: (int) apply param noise every N steps
    """
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 enable_popart=popart,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        tf.train.Saver()

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with tf_util.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        step = 0

        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for _ in range(nb_epoch_cycles):
                # Perform rollouts.
                for _ in range(nb_rollout_steps):
                    # Predict next action.
                    action, q_value = agent.policy(obs,
                                                   apply_noise=True,
                                                   compute_q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, reward, done, _ = env.step(max_action * action)
                    step += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += reward
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q_value)
                    agent.store_transition(obs, action, reward, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    critic_loss, actor_loss = agent.train()
                    epoch_critic_losses.append(critic_loss)
                    epoch_actor_losses.append(actor_loss)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for _ in range(nb_eval_steps):
                        eval_action, eval_q = agent.policy(eval_obs,
                                                           apply_noise=False,
                                                           compute_q=True)
                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        eval_obs, eval_r, eval_done, _ = eval_env.step(
                            max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(step) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(scalar):
                """
                check and return the input if it is a scalar, otherwise raise ValueError

                :param scalar: (Any) the object to check
                :return: (Number) the scalar if x is a scalar
                """
                if isinstance(scalar, np.ndarray):
                    assert scalar.size == 1
                    return scalar[0]
                elif np.isscalar(scalar):
                    return scalar
                else:
                    raise ValueError('expected scalar, got %s' % scalar)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = step

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as file_handler:
                        pickle.dump(env.get_state(), file_handler)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as file_handler:
                        pickle.dump(eval_env.get_state(), file_handler)
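
For context, a minimal call-site sketch for the documented train() above. It assumes helper classes with the constructors used elsewhere in these examples (Memory, models.Actor/Critic, OrnsteinUhlenbeckActionNoise); the module paths follow the older OpenAI-baselines layout and may differ in other releases, and the hyperparameter values are purely illustrative.

import gym
import numpy as np
from baselines.ddpg.memory import Memory
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]

memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=0.2 * np.ones(nb_actions))

train(env=env, nb_epochs=10, nb_epoch_cycles=20, render_eval=False,
      reward_scale=1.0, render=False, param_noise=None,
      actor=Actor(nb_actions, layer_norm=True),
      critic=Critic(layer_norm=True),
      normalize_returns=False, normalize_observations=True,
      critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
      action_noise=action_noise, popart=False, gamma=0.99, clip_norm=None,
      nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100,
      batch_size=64, memory=memory)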
def train(env_id, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50, my_render=True, eg_explore=True,reward_param_scaling=1.0,
    reward_param_thr = 70, reward_param_type='const'):

    print('Start training for env: '+env_id)
    #change to your dir of choice for saving
    save_path = os.getcwd()
    print('Save data at '+save_path+'. Change to your desired path.')

    dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.pkl'
    append_num = 0
    while os.path.exists(os.path.join(save_path,dump_name)):
        dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.'+str(append_num)+'.pkl'
        append_num+=1

    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_com_sav = []

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            # collect data for saving plot
            save_data = {'act': [],
                         'obs': [],
                         'qpos':[],
                         'rew':[], # reward for this episode
                         'freq_com':[], # communication frequency
                         'act_ts': [],
                         'obs_ts': [],
                         'qpos_ts': [],
                         'rew_ts': [],  # reward for this episode
                         'freq_com_ts': [],  # communication frequency
                         'comm_r_factor':reward_param_scaling,
                         'eplen_ts':[] # len of test episodes
                         }

            # decay the exploration
            e_greed = 0.5 - 0.1 * np.log10( (t%10000) + 1)
            explore_switch = (t < 20000 and eg_explore and e_greed > 0)
            print('total steps: '+str(t)+', eps greedy rate: '+str(e_greed)+', explore is '+str(explore_switch))

            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.

                # init u_old, don't forget to change test also
                u_old = 1.0 * env.action_space.sample() / max_action

                num_no_com = 0
                for t_rollout in range(nb_rollout_steps):

                    # Predict next action.
                    # edit this to be param version
                    a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=True, compute_Q=True)
                    a0 = a_raw[0]
                    a1 = a_raw[1]

                    # eps greedy, flip the coin
                    # make eps decay first 10k updates
                    dice_greed = np.random.uniform()
                    if explore_switch and dice_greed < e_greed:
                        com = ( np.random.uniform() > 0.5 )
                    else:
                        com = (a0 > a1)

                    # action according to com switch
                    if com:
                        r_com = 0.0
                        action = np.copy(a_raw[2:]) #motor cmd
                    else:
                        if reward_param_type=='const':
                            r_com = 1. # const reward
                        elif reward_param_type=='linear':
                            r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com) # linear interp reward
                        elif reward_param_type=='inv':
                            r_com = 1.0 / (1.0 + (np.maximum(num_no_com - reward_param_thr, 0)))  # inv decay reward
                        else:
                            print('no such reward type!')
                            assert 1==0

                        r_com = reward_param_scaling * r_com
                        action = np.copy(u_old)
                        num_no_com += 1

                    assert action.shape == env.action_space.shape

                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        pass
                        # env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(a_raw)
                    epoch_qs.append(q)

                    agent.store_transition(np.concatenate([obs,u_old],axis=0), a_raw, r+r_com, np.concatenate([np.squeeze(new_obs), action],axis=0) , done)
                    obs = np.squeeze(new_obs)

                    save_data['act'].append(np.array(action))
                    save_data['obs'].append(np.array(obs))
                    if hasattr(env.unwrapped, 'data'):
                        save_data['qpos'].append(np.array(env.unwrapped.data.qpos))

                    u_old = np.copy(action)

                    if done:
                        # Episode done.
                        epoch_com_sav.append(np.asarray(1.0*num_no_com/episode_step))

                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()


                print('communication savings: ' + str(num_no_com)) # check com number
                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()

            # log stuff
            save_data['rew'].append(np.mean(epoch_episode_rewards))
            save_data['freq_com'].append(np.mean(epoch_com_sav))

            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        ###===============================================
        # test the fully-trained agent
        env = env.unwrapped

        print('*Final testing*')
        n_test = 1
        n_ts_rollout = 500
        # obs = env.env.reset()
        for i_test in range(n_test):
            if i_test%50==0:
                print('test iteration: '+str(i_test))
            obs = env.reset()
            # take some actions
            # start with small during test time
            u_old = 0 * env.action_space.sample() / max_action

            num_no_com = 0

            ts_step = 0
            ts_reward = 0
            for i_test_rollout in range(n_ts_rollout):
                # Predict next action.
                # edit this to be param version
                a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=False, compute_Q=True)
                a0 = a_raw[0]
                a1 = a_raw[1]

                com = (a0 > a1)

                # action according to com switch
                if com:
                    action = np.copy(a_raw[2:])
                else:
                    action = np.copy(u_old)
                    num_no_com += 1

                assert action.shape == env.action_space.shape

                new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # print('Done: '+str(done))
                ts_reward += r
                ts_step += 1


                # record trajectory
                save_data['act_ts'].append(max_action * action)  # record the actual u
                save_data['obs_ts'].append(np.array(obs))


                u_old = np.copy(action)
                obs = np.copy(new_obs) # update obs

            # # store episode rew as performance measure
            # save_data['eplen_ts'].append(np.array(i_test_rollout+1))
            # save_data['rew_ts'].append(np.array(ts_reward))
            # save_data['freq_com_ts'].append(np.array(1.0*num_no_com/(i_test_rollout+1)))

            agent.reset() # doesn't matter if not stochastic

        # plot the trajectory
        ### states
        xs = np.asarray(save_data['obs_ts'])
        ths = np.arctan2(xs[:, 1], xs[:, 0])

        ### control
        us = np.asarray(save_data['act_ts'])

        id_seg = 0

        horz_plt = 500
        plt.figure(figsize=[15, 20])
        plt.subplot(211)
        plt.plot(ths[id_seg * horz_plt:(id_seg + 1) * horz_plt], label='th')
        plt.plot(xs[:, 2][id_seg * horz_plt:(id_seg + 1) * horz_plt], color='g', label='th_dot')
        plt.legend()
        plt.title('state plot')

        plt.subplot(212)
        plt.plot(us[id_seg * horz_plt:(id_seg + 1) * horz_plt], color='r')
        plt.title('control plot')

        plt.show()
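
The rollout above mixes the environment reward with a shaping term r_com that rewards skipping communication and replaying the previous command, with three schedules selected by reward_param_type. Factored out for clarity, the same rule looks like the sketch below (a standalone illustration using the parameter names above, not part of the original code).

import numpy as np


def communication_reward(reward_param_type, num_no_com, nb_rollout_steps,
                         reward_param_thr, reward_param_scaling):
    # Bonus granted for *not* communicating at the current step.
    if reward_param_type == 'const':
        r_com = 1.0  # constant bonus
    elif reward_param_type == 'linear':
        # Linearly interpolated bonus that shrinks as the no-communication
        # streak grows towards nb_rollout_steps.
        r_com = (nb_rollout_steps - num_no_com) / float(nb_rollout_steps - reward_param_thr)
    elif reward_param_type == 'inv':
        # Inverse decay once the streak exceeds the threshold.
        r_com = 1.0 / (1.0 + np.maximum(num_no_com - reward_param_thr, 0))
    else:
        raise ValueError('no such reward type: {}'.format(reward_param_type))
    return reward_param_scaling * r_com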
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    #print(np.abs(env.action_space.low))
    #print(np.abs(env.action_space.high))
    #assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    logger.info('scaling actions by {} before executing in env'.format(max_action))
    if load_memory:
        memory=pickle.load(open("/home/vaisakhs_shaj/Desktop/BIG-DATA/memory1000000.pickle","rb"))
        '''
        samps = memoryPrev.sample(batch_size=memoryPrev.nb_entries)
        print(len(samps['obs0'][1]))
        for i in range(memoryPrev.nb_entries):
            memory.append(samps['obs0'][i], samps['actions'][i], samps['rewards'][i], samps['obs1'][i],  samps['terminals1'][i])
        '''
        print("=============memory loaded================")

    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))
    envs = [make_env(seed) for seed in range(nproc)]
    envs = SubprocVecEnv(envs)
    
    '''
     # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None
    '''
    saver=tf.train.Saver()
    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=10)

    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        if restore:
            filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(15000)+".model"
            saver.restore(sess,filename)
            print("loaded!!!!!!!!!!!!!")
            #p=[v.name for v in tf.all_variables()]
            #print(p)
        
        obs = envs.reset()

        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_reward3 = 0.
        episode_step = 0
        episode_step3 = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = deque(maxlen=10)
        epoch_episode_steps3 = deque(maxlen=10)
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        learning_starts = 10000
        for epoch in range(nb_epochs):
            print("cycle-memory")
            print(max_action)
            for cycle in range(nb_epoch_cycles):
                print(cycle,"-",memory.nb_entries,end=" ")
                sys.stdout.flush()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=False)[0] for i in range(nproc)])
                    q = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=True)[1] for i in range(nproc)])
                    # action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    #assert action.shape == env.action_space.shape
                    #print(i)
                    # Execute next action in parallel.
                    if rank == 0 and render:
                        env.render()
                    #assert max_action.shape == action.shape
                    new_obs, r, done, info = envs.step(action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    #print(r)
                    #print(r[1])
                    sys.stdout.flush()
                    episode_reward += r[1]
                    #episode_reward3 += r[2]
                    episode_step += 1
                    #episode_step3 += 1
                    '''
                    if episode_step==300:
                        e=episode_step
                        re=episode_reward
                    if episode_step>300:
                        episode_step=e
                        episode_reward=re
                    '''
                    #print(episode_step)

                    book_keeping_obs=obs
                    obs = new_obs
                    #print(envs[1])
                    #print(episode_reward)
                    # Book-keeping in parallel.
                    epoch_actions.append(np.mean(action))
                    epoch_qs.append(np.mean(q))
                    for i in range(nproc):
                        agent.store_transition(book_keeping_obs[i], action[i], r[i], new_obs[i], done[i])
                        #print(done)
                        if done[i]:
                            # Episode done.
                            #print("====done====",episode_reward)
                            if i==1:
                                
                                epoch_episode_rewards.append(episode_reward)
                                #rint(epoch_episode_rewards)
                                #episode_rewards_history.append(episode_reward)
                                epoch_episode_steps.append(episode_step)
                                episode_reward = 0.
                                #episode_reward3 = 0
                                episode_step = 0
                                epoch_episodes += 1
                                episodes += 1
                            '''
                            if i==2:
                                
                                #epoch_episode_rewards.append(episode_reward3)
                                #rint(epoch_episode_rewards)
                                episode_rewards_history.append(episode_reward3)
                                epoch_episode_steps3.append(episode_step3)
                                episode_reward3 = 0
                                episode_step3 = 0
                            '''    

                            agent.reset()
                            temp = envs.reset()
                            obs[i] = temp[i]

                    # Note: TensorFlow variables only have values inside a session;
                    # once the session is over, the variables are lost, so
                    # saver.save and saver.restore must be called inside the session.

                    # Train.
                    epoch_actor_losses = []
                    epoch_critic_losses = []
                    epoch_adaptive_distances = []
                    for t_train in range(nb_train_steps):
                        # Adapt param noise, if necessary.
                        if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                            distance = agent.adapt_param_noise()
                            epoch_adaptive_distances.append(distance)

                        cl, al = agent.train()
                        epoch_critic_losses.append(cl)
                        epoch_actor_losses.append(al)
                        agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                #print(episode_rewards_history) 
            if (t)%20000 == 0:
                fname="/home/vaisakhs_shaj/Desktop/BIG-DATA/memoryStill"+str(memory.nb_entries)+".pickle"
                pickle.dump(memory,open(fname,"wb"),protocol=-1)
            if t % 5000 == 0:
                print("=======saving interim model==========")
                filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(t)+".model"
                saver.save(sess,filename)
            mpi_size = MPI.COMM_WORLD.Get_size()
            
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps2'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/episode_steps3'] = np.mean(epoch_episode_steps3)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
               

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            print(logdir)
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
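
The parallel variant above drives a SubprocVecEnv by querying the single DDPG policy once per worker and storing each worker's transition separately. A compact sketch of that per-worker bookkeeping, assuming an envs object whose step() returns batched arrays as in the loop above (the helper is illustrative, not part of the original code); calling agent.pi once per worker with compute_Q=True also avoids the duplicated forward pass in the original rollout.

import numpy as np


def vectorized_rollout_step(agent, envs, obs, nproc):
    # One environment step across all workers with a shared DDPG policy.
    actions, qs = [], []
    for i in range(nproc):
        a, q = agent.pi(obs[i], apply_noise=True, compute_Q=True)
        actions.append(a)
        qs.append(q)
    actions = np.stack(actions)

    new_obs, rewards, dones, _ = envs.step(actions)
    for i in range(nproc):
        agent.store_transition(obs[i], actions[i], rewards[i],
                               new_obs[i], dones[i])
    return new_obs, rewards, dones, np.stack(qs)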
Example No. 14
class DDPGEnvLearner(EnvLearner):
    def __init__(self, env_in):
        EnvLearner.__init__(self, env_in)
        # from baselines.ddpg.models import Actor, Critic
        # Parse noise_type
        action_noise = None
        param_noise = None
        noise_type = 'adaptive-param_0.2'
        layer_norm = True
        nb_actions = self.state_dim
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.

        self.buff_len = 10
        self.buffer = deque(self.buff_init * self.buff_len,
                            maxlen=self.buff_len)
        obs_space = (self.buff_init[0].size * self.buff_len, )
        self.memory = Memory(limit=int(1e6),
                             action_shape=env_in.observation_space.shape,
                             observation_shape=obs_space)
        self.critic = models.Critic(layer_norm=layer_norm)
        self.actor = models.Actor(nb_actions, layer_norm=layer_norm)

        self.agent = DDPG(self.actor,
                          self.critic,
                          self.memory,
                          obs_space,
                          env_in.observation_space.shape,
                          gamma=0.99,
                          tau=0.01,
                          normalize_returns=False,
                          normalize_observations=True,
                          batch_size=64,
                          action_noise=action_noise,
                          param_noise=param_noise,
                          critic_l2_reg=1e-2,
                          actor_lr=1e-5,
                          critic_lr=1e-5,
                          enable_popart=False,
                          clip_norm=None,
                          reward_scale=1.)

    def initialize(self, session, load=False):
        self.sess = session
        if not load:
            self.sess.run(tf.global_variables_initializer())
        self.agent.initialize(self.sess)

    def train(self,
              train,
              total_steps,
              valid=None,
              log_interval=10,
              early_stopping=-1,
              saver=None,
              save_str=None):
        G, yS, yR, yD, X, S, A = self.__prep_data__(train, batch_size=0)
        X = X[0]
        S = S[0]
        self.agent.reset()
        # max_action = self.env.action_space.high
        batch_size = 64
        t = 0
        episode_reward = 0
        episode_step = 0
        episodes = 0
        epoch_episodes = 0
        epoch_episode_rewards = []
        nb_epoch_cycles = 10
        nb_rollout_steps = 100
        nb_epochs = int(len(train) / (nb_epoch_cycles * nb_rollout_steps))

        nb_train_steps = total_steps
        param_noise_adaption_interval = 50
        i = 0

        for epoch in range(nb_epochs):
            start_time = time.time()
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.

                    # (obs_in, action_in, _, new_obs_in, done, episode_step) = train[i]

                    # obs = np.array([np.concatenate([obs_in/self.state_mul_const,
                    #                                 action_in/self.act_mul_const])]).flatten()
                    obs = X[i]
                    done = train[i][4]
                    action, q = self.agent.pi(obs,
                                              apply_noise=True,
                                              compute_Q=True)
                    r = -np.linalg.norm(S[i] / self.state_mul_const -
                                        action) / action.shape[0]

                    # if not done and i < len(train):
                    #     new_obs = np.array([np.concatenate([new_obs_in / self.state_mul_const,
                    #                                     train[i][1] / self.act_mul_const])]).flatten()
                    # else:
                    #     new_obs = np.array([np.concatenate([new_obs_in / self.state_mul_const,
                    #                                     np.zeros_like(action_in)])]).flatten()
                    if i < len(train):
                        new_obs = X[i + 1]
                    else:
                        new_obs = np.zeros_like(X[i])
                    t += 1
                    i += 1
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    self.agent.store_transition(obs, action, r, new_obs, done)

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_reward = 0.
                        epoch_episodes += 1
                        episodes += 1

                        self.agent.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = self.agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = self.agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    self.agent.update_target_net()
            print('Epoch ' + str(epoch) + '/' + str(nb_epochs) +
                  ' with avg rew of: ' +
                  str(sum(epoch_episode_rewards) /
                      len(epoch_episode_rewards)) + ' in ' +
                  str(time.time() - start_time) + 's')
            if epoch % log_interval == 0 and epoch > 0:
                if saver is not None and save_str is not None:
                    save_path = saver.save(self.sess,
                                           'models/' + str(save_str) + '.ckpt')
                    print("Model saved in path: %s" % save_path)
        if saver is not None and save_str is not None:
            save_path = saver.save(self.sess,
                                   'models/' + str(save_str) + '.ckpt')
            print("Model saved in path: %s" % save_path)

    def step(self, obs_in, action_in, episode_step, save=True, buff=None):
        import copy
        obs = obs_in / self.state_mul_const
        action = action_in / self.act_mul_const
        if save:
            if episode_step == 0:
                self.buffer = deque(self.buff_init * self.buff_len,
                                    maxlen=self.buff_len)
            self.buffer.append(
                np.array([np.concatenate([obs, action])]).flatten())
        else:
            if buff is None:
                buff = copy.copy(self.buffer)
            if episode_step == 0:
                buff = deque(self.buff_init * self.buff_len,
                             maxlen=self.buff_len)
            buff.append(np.array([np.concatenate([obs, action])]).flatten())

        if buff is not None:
            x = np.array([np.concatenate(buff).flatten()])[0]
        else:
            x = np.array([np.concatenate(self.buffer).flatten()])[0]
        new_obs, _ = self.agent.pi(x, apply_noise=True, compute_Q=True)
        return new_obs
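
A small self-contained sketch of the rolling (observation, action) window that step() maintains above; the dimensions and scaling constants here are hypothetical, chosen only to illustrate how the deque is flattened into a single model input.

import numpy as np
from collections import deque

# Hypothetical sizes and scales; the real ones come from the environment.
obs_dim, act_dim, buff_len = 4, 2, 3
state_mul_const, act_mul_const = 10.0, 1.0

# Start the window pre-filled with zeros so it always holds buff_len entries.
window = deque([np.zeros(obs_dim + act_dim)] * buff_len, maxlen=buff_len)

def push(obs_in, action_in):
    # Normalize, concatenate, and append; the deque drops its oldest entry.
    obs = obs_in / state_mul_const
    action = action_in / act_mul_const
    window.append(np.concatenate([obs, action]))
    # Flatten the whole window into one input vector for the model.
    return np.concatenate(window)

x = push(np.ones(obs_dim), np.ones(act_dim))
print(x.shape)  # (buff_len * (obs_dim + act_dim),) == (18,)
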
def train_return(env,
                 param_noise,
                 actor,
                 critic,
                 memory,
                 nb_epochs=250,
                 nb_epoch_cycles=20,
                 reward_scale=1.,
                 render=False,
                 normalize_returns=False,
                 normalize_observations=True,
                 critic_l2_reg=1e-2,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 action_noise=None,
                 popart=False,
                 gamma=0.99,
                 clip_norm=None,
                 nb_train_steps=50,
                 nb_rollout_steps=2048,
                 batch_size=64,
                 tau=0.01,
                 param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    # Set up logging stuff only for a single worker.

    episode_rewards_history = deque(maxlen=100)
    # Prepare everything. This function does not open its own session; it
    # assumes the caller has already entered one (e.g. via
    # U.single_threaded_session()), so grab the current default session.
    sess = tf.get_default_session()
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()
    obs = env.reset()
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        print('epoch number:', epoch)
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                assert action.shape == env.action_space.shape

                # Execute next action.
                if rank == 0 and render:
                    env.render()
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(
                    max_action * action
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()
    return agent
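
A minimal caller sketch for train_return, assuming the pre-refactor OpenAI baselines DDPG modules used throughout these examples (baselines.ddpg.models, baselines.ddpg.memory, baselines.common.tf_util) and an arbitrary Gym environment with a symmetric Box action space. Because train_return relies on the caller's default TF session, the session is opened outside the call.

import gym
from baselines.common import tf_util as U
from baselines.ddpg.memory import Memory
from baselines.ddpg.models import Actor, Critic

env = gym.make('Pendulum-v0')  # any env with a symmetric Box action space
nb_actions = env.action_space.shape[-1]

memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=True)
actor = Actor(nb_actions, layer_norm=True)

# train_return picks up the default TF session entered here.
with U.single_threaded_session():
    agent = train_return(env, param_noise=None, actor=actor, critic=critic,
                         memory=memory, nb_epochs=1, nb_epoch_cycles=2,
                         nb_rollout_steps=200)
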
Ejemplo n.º 16
0
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
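
The per-epoch logging above averages every stat across MPI workers by summing with allreduce and dividing by the world size. A self-contained mpi4py sketch of just that pattern (the file name and stat values are made up):

# Run under MPI, e.g.: mpirun -np 4 python mpi_stats_sketch.py
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# Each worker has its own local stats; here they just depend on the rank.
local_stats = {'rollout/return': float(rank), 'train/loss_actor': 0.1 * rank}

# Element-wise sum across workers, then divide by the number of workers.
sums = comm.allreduce(np.array(list(local_stats.values())))
averaged = {k: s / size for k, s in zip(local_stats.keys(), sums)}

if rank == 0:
    print(averaged)  # with 4 workers: {'rollout/return': 1.5, 'train/loss_actor': ~0.15}
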
Ejemplo n.º 17
0
def main():
    with U.single_threaded_session() as sess:
        batch_size = 64
        current_noise_type = 'adaptive-param_0.2'
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        param_noise_adaption_interval = 2
        env = gym.make("Pendulum-v0")

        nb_actions = env.action_space.shape[-1]
        layer_norm = True

        # Configure components.
        memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = int(1000000 * np.random.rand())
        logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.seed(seed)

        max_action = env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(max_action))
        agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                     batch_size=batch_size, param_noise=param_noise)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        obs = env.reset()
        for t in itertools.count():
            episode_rewards = []
            done = False
            while not done:
                env.render()

                # Take action and update exploration to the newest value
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                new_obs, rew, done, _ = env.step(max_action * action)

                # Book-keeping.
                agent.store_transition(obs, action, rew, new_obs, done)
                obs = new_obs

                episode_rewards.append(rew)
                if done:
                    agent.reset()
                    obs = env.reset()

            nb_train_steps = 100
            epoch_adaptive_distances = []
            epoch_critic_losses = []
            epoch_actor_losses = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            if t % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1))
                logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses)))
                logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses)))
                logger.record_tabular('train/param_noise_distance', round(np.mean(epoch_adaptive_distances)))
                logger.dump_tabular()
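
For context, agent.adapt_param_noise() in the loop above measures how far the parameter-perturbed policy's actions drift from the unperturbed ones and nudges the noise stddev toward a target. A simplified, self-contained re-implementation of that adaptation rule (the coefficient mirrors the usual default; the distances fed in are invented for illustration):

class SimpleAdaptiveParamNoise:
    def __init__(self, initial_stddev=0.2, desired_action_stddev=0.2,
                 adoption_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # Perturbed actions drift too far -> shrink the noise; otherwise grow it.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adoption_coefficient
        else:
            self.current_stddev *= self.adoption_coefficient

spec = SimpleAdaptiveParamNoise()
for distance in [0.05, 0.10, 0.40, 0.30, 0.15]:  # pretend measured action distances
    spec.adapt(distance)
print(round(spec.current_stddev, 4))  # 0.202
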
Ejemplo n.º 18
0
class DDPGAgent(BaseAgent):
    """A Deep Deterministic Policy Gradient implementation of an SC2 agent."""
    def __init__(self):
        super(DDPGAgent, self).__init__()
        return

    def setup(self,
              obs_shape,
              nb_actions,
              action_spec,
              noise_type,
              gamma=1.,
              tau=0.01,
              layer_norm=True):
        super(DDPGAgent, self).setup(obs_shape, nb_actions, action_spec,
                                     noise_type, gamma, tau, layer_norm)

        self.action_spec_internal = action_spec
        self.obs_dim = obs_shape
        action_noise = None
        param_noise = None

        # Parse noise_type
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
        self.memory = Memory(limit=int(500),
                             action_shape=(nb_actions, ),
                             observation_shape=obs_shape)
        self.critic = Critic(layer_norm=layer_norm, hidden_size=128)
        self.actor = Actor(nb_actions, layer_norm=layer_norm, hidden_size=128)

        tf.reset_default_graph()

        # max_action = env.action_space.high
        self.ddpg = DDPG(actor=self.actor,
                         critic=self.critic,
                         memory=self.memory,
                         observation_shape=obs_shape,
                         action_shape=(nb_actions, ),
                         gamma=gamma,
                         tau=tau,
                         action_noise=action_noise,
                         param_noise=param_noise)

    def step(self, obs):
        super(DDPGAgent, self).step(obs)
        acts, q = self.ddpg.pi(obs, apply_noise=True, compute_Q=True)
        # Flip and rescale actions from [-1, 1] to [0, 1]: -1 maps to 1, +1 maps to 0.
        actions_z = (2 - (acts + 1)) / 2
        return actions_z, q

    def reset(self):
        super(DDPGAgent, self).reset()
        self.ddpg.reset()

    def initialize(self, sess):
        super(DDPGAgent, self).initialize(sess)
        self.ddpg.initialize(sess)

    def store_transition(self, obs, action, r, new_obs, done):
        super(DDPGAgent, self).store_transition(obs, action, r, new_obs, done)
        self.ddpg.store_transition(obs, action, r, new_obs, done)

    def train(self):
        super(DDPGAgent, self).train()
        return self.ddpg.train()

    def adapt_param_noise(self):
        super(DDPGAgent, self).adapt_param_noise()
        return self.ddpg.adapt_param_noise()

    def backprop(self):
        super(DDPGAgent, self).backprop()
        self.ddpg.update_target_net()

    def get_memory_size(self):
        super(DDPGAgent, self).get_memory_size()
        return self.memory.nb_entries

    @property
    def action_spec(self):
        return self.action_spec_internal

    @property
    def obs_shape(self):
        return self.obs_dim
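
The OrnsteinUhlenbeckActionNoise built in setup() above produces temporally correlated exploration noise. A self-contained numpy sketch of the underlying process (theta and dt are common choices picked here for illustration; the library class may use different defaults):

import numpy as np

class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, 1)."""
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def reset(self):
        # Restart the process at the mean, as is done between episodes.
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        noise = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + noise
        return self.x_prev

nb_actions = 2
ou = OUNoiseSketch(mu=np.zeros(nb_actions), sigma=0.2 * np.ones(nb_actions))
samples = np.array([ou() for _ in range(5)])  # consecutive samples are correlated
print(samples.shape)  # (5, 2)
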
Ejemplo n.º 19
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          save_model,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
        if not os.path.exists(os.path.join(logger.get_dir(), 'model')):
            os.makedirs(os.path.join(logger.get_dir(), 'model'))
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.make_session(
            num_cpu=4) as sess:  # U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
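
The example above checkpoints through tf.train.Saver on rank 0. A minimal standalone sketch of that save/restore pattern (TensorFlow 1.x style, with a throwaway variable and a hypothetical checkpoint path):

import os
import tensorflow as tf

tf.reset_default_graph()
counter = tf.get_variable('counter', shape=(), initializer=tf.zeros_initializer())
increment = tf.assign_add(counter, 1.0)
saver = tf.train.Saver()

ckpt_dir = '/tmp/ddpg_saver_sketch'           # hypothetical location
os.makedirs(ckpt_dir, exist_ok=True)
ckpt_path = os.path.join(ckpt_dir, 'model.ckpt')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(increment)
    save_path = saver.save(sess, ckpt_path)
    print("Model saved in path: %s" % save_path)

with tf.Session() as sess:
    saver.restore(sess, ckpt_path)            # variables come back as saved
    print(sess.run(counter))                  # 1.0
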
Ejemplo n.º 20
0
def train(env,
          num_timesteps,
          nb_trials,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          test_interval,
          batch_size,
          memory,
          output,
          load_file,
          save=False,
          tau=0.01,
          evaluation=False,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    observation_range = [env.observation_space.low, env.observation_space.high]
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 observation_range=observation_range)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    trial_return_history = deque(maxlen=100)
    eval_trial_return_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        #dir_path = os.path.dirname(os.path.realpath(__file__))
        #tf.summary.FileWriter(dir_path, sess.graph)

        trial = 0
        ts = 0

        if load_file != '':
            saver.restore(sess, load_file)

        start_time = time.time()

        trial_returns = []
        trial_steps = []
        actions = []
        qs = []
        train_actor_losses = []
        train_critic_losses = []
        train_adaptive_distances = []

        while True:
            test = (test_interval >= 0
                    and trial % (test_interval + 1) == test_interval)

            if not test:
                # Perform rollout.
                env.set_test(test=False)
                obs = env.reset()
                agent.reset()
                done = 0
                trial_return = 0.
                trial_step = 0
                while done == 0:
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    ts += 1
                    if rank == 0 and render:
                        env.render()
                    trial_return += r
                    trial_step += 1

                    # Book-keeping.
                    actions.append(action)
                    qs.append(q)
                    agent.store_transition(
                        obs, action, r, new_obs,
                        done == 2)  # terminal indicator is 2
                    obs = new_obs

                    # Train.
                    if memory.nb_entries >= batch_size:
                        for t_train in range(nb_train_steps):
                            # Adapt param noise, if necessary.
                            if trial % param_noise_adaption_interval == 0:
                                distance = agent.adapt_param_noise()
                                train_adaptive_distances.append(distance)

                            cl, al = agent.train()
                            train_critic_losses.append(cl)
                            train_actor_losses.append(al)
                            agent.update_target_net()

                # Episode done.
                trial_steps.append(trial_step)
                trial_returns.append(trial_return)
                trial_return_history.append(trial_return)

            else:
                # Evaluate.
                eval_trial_return = 0.
                eval_trial_steps = 0
                if evaluation is not None:
                    env.set_test(test=True)
                    eval_obs = env.reset()
                    agent.reset()
                    eval_done = 0
                    while eval_done == 0:
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            env.render()
                        eval_trial_return += eval_r
                        eval_trial_steps += 1
                    # Episode done.
                    eval_trial_return_history.append(eval_trial_return)

                # Log stats.
                duration = time.time() - start_time
                combined_stats = {}
                if memory.nb_entries > 0:
                    # Print only if learning was happening
                    stats = agent.get_stats()
                    for key in sorted(stats.keys()):
                        combined_stats[key] = mpi_mean(stats[key])

                    # Rollout statistics.
                    combined_stats['rollout/Q_mean'] = mpi_mean(qs)
                    combined_stats['rollout/actions_mean'] = mpi_mean(actions)
                    combined_stats['rollout/actions_std'] = mpi_std(actions)
                    combined_stats['rollout/trial_steps'] = mpi_mean(
                        trial_steps)
                    combined_stats['rollout/return'] = mpi_mean(trial_returns)
                    combined_stats['rollout/return_history'] = mpi_mean(
                        trial_return_history)

                    # Train statistics.
                    combined_stats['train/loss_actor'] = mpi_mean(
                        train_actor_losses)
                    combined_stats['train/loss_critic'] = mpi_mean(
                        train_critic_losses)
                    combined_stats['train/param_noise_distance'] = mpi_mean(
                        train_adaptive_distances)

                # Evaluation statistics.
                if evaluation is not None:
                    combined_stats['eval/Q'] = mpi_mean(eval_q)
                    combined_stats['eval/return'] = eval_trial_return
                    combined_stats['eval/return_history'] = mpi_mean(
                        eval_trial_return_history)
                    combined_stats['eval/steps'] = eval_trial_steps

                # Total statistics.
                combined_stats['total/duration'] = mpi_mean(duration)
                combined_stats['total/steps_per_second'] = mpi_mean(
                    float(ts) / float(duration))
                combined_stats['total/trials'] = trial
                combined_stats['total/steps'] = ts

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()
                if rank == 0 and logdir:
                    if hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)
                    if evaluation and hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)

                # Reset statistics.
                trial_returns = []
                trial_steps = []
                actions = []
                qs = []
                train_actor_losses = []
                train_critic_losses = []
                train_adaptive_distances = []
                # End of evaluate and log statistics

            # Check if this is the last trial
            trial += 1
            if nb_trials and trial >= nb_trials:
                break
            if num_timesteps and ts >= num_timesteps:
                break

        # Saving policy and value function
        if save and saver and output != '':
            saver.save(sess, './%s' % output)
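
The trial loop above alternates training and evaluation via test_interval: within every block of (test_interval + 1) trials, the last one is a test trial. A tiny pure-Python sketch of that schedule:

# With test_interval = 3, trials 3, 7, 11, ... are evaluation trials.
test_interval = 3
schedule = []
for trial in range(10):
    test = (test_interval >= 0
            and trial % (test_interval + 1) == test_interval)
    schedule.append('test' if test else 'train')
print(schedule)
# ['train', 'train', 'train', 'test', 'train', 'train', 'train', 'test', 'train', 'train']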