Example #1
def train(env_id, num_frames, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)


    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json"%rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
        max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
Example #2
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Example #3
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)
    env = make_atari(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Example #4
def test_function():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})
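        # givens={y: 0} supplies a default of 0 for y, so calling lin(2)
        # below evaluates 3*2 + 2*0 == 6, while lin(2, 2) overrides it.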

        with single_threaded_session():
            initialize()

            assert lin(2) == 6
            assert lin(2, 2) == 10
Example #5
def test_multikwargs():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2 = tf.placeholder(tf.int32, (), name="x")
        z = 3 * x + 2 * x2

        lin = function([x, x2], z, givens={x2: 0})
        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
Example #6
def test_function():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    lin = function([x, y], z, givens={y: 0})

    with single_threaded_session():
        initialize()

        assert lin(2) == 6
        assert lin(x=3) == 9
        assert lin(2, 2) == 10
        assert lin(x=2, y=3) == 12
Example #7
def test_set_value():
    a = tf.Variable(42.)
    with single_threaded_session():
        set_value(a, 5)
        assert a.eval() == 5
        g = tf.get_default_graph()
        g.finalize()
        set_value(a, 6)
        assert a.eval() == 6

        # test the test
        try:
            assert a.eval() == 7
        except AssertionError:
            pass
        else:
            assert False, "assertion should have failed"
Example #8
def test_multikwargs():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    z = 3 * x + 2 * x2

    lin = function([x, x2], z, givens={x2: 0})
    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(2, 2) == 10
        expt_caught = False
        try:
            lin(x=2)
        except AssertionError:
            expt_caught = True
        assert expt_caught
Example #9
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #10
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
            hid_size=32, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #11
def train(env,
          nb_epochs,
          nb_episodes,
          episode_length,
          nb_train_steps,
          eval_freq,
          nb_eval_episodes,
          actor,
          critic,
          memory,
          gamma,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          clip_norm,
          batch_size,
          reward_scale,
          tau=0.01):
    """
    Parameters
    ----------
    nb_epochs : the number of epochs to train.

    nb_episodes : the number of episodes for each epoch.

    episode_length : the maximum number of steps for each episode.

    gamma : discount factor.

    tau : soft update coefficient.

    clip_norm : clip on the norm of the gradient.
    """
    # Initialize DDPG agent (target network and replay buffer)
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=None,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    # We need max_action because the actor's output layer is a tanh,
    # so its [-1, 1] output must be scaled back to the environment's action range.
    max_action = env.action_space.high
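    # e.g. if env.action_space.high == [2.0], a tanh output of 0.5 is
    # executed as 0.5 * 2.0 = 1.0 in the environment.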

    with U.single_threaded_session() as sess:
        agent.initialize(sess)

        # Setup summary writer
        writer = _setup_tf_summary()
        writer.add_graph(sess.graph)

        stats = EvaluationStatistics(tf_session=sess, tf_writer=writer)
        sess.graph.finalize()

        global_step = 0
        obs = env.reset()
        agent.reset()
        for epoch in range(nb_epochs):
            for episode in range(nb_episodes):
                obs = env.reset()
                # Generate a trajectory
                for t in range(episode_length):
                    # Select action a_t according to current policy and
                    # exploration noise
                    a_t, _ = agent.pi(obs, apply_noise=True, compute_Q=False)
                    assert a_t.shape == env.action_space.shape

                    # Execute action a_t and observe reward r_t and next state s_{t+1}
                    new_obs, r_t, done, info = env.step(max_action * a_t)

                    # Store transition in the replay buffer
                    agent.store_transition(obs, a_t, r_t, new_obs, done)
                    obs = new_obs

                    if done:
                        agent.reset()
                        obs = env.reset()
                        break  # End episode

                # Training phase
                for t_train in range(nb_train_steps):
                    critic_loss, actor_loss = agent.train()
                    agent.update_target_net()

                    # Plot statistics
                    stats.add_critic_loss(critic_loss, global_step)
                    stats.add_actor_loss(actor_loss, global_step)
                    global_step += 1

                # Evaluation phase
                if episode % eval_freq == 0:
                    # Generate evaluation trajectories
                    for eval_episode in range(nb_eval_episodes):
                        obs = env.reset()
                        for t in range(episode_length):
                            env.render()

                            # Select action a_t according to current policy and
                            # exploration noise
                            a_t, _ = agent.pi(obs,
                                              apply_noise=False,
                                              compute_Q=False)
                            assert a_t.shape == env.action_space.shape

                            # Execute action a_t and observe reward r_t and next state s_{t+1}
                            obs, r_t, eval_done, info = env.step(max_action *
                                                                 a_t)
                            stats.add_reward(r_t)

                            if eval_done:
                                obs = env.reset()
                                break

                    # Plot average reward
                    stats.plot_reward(global_step)
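# --- Hypothetical usage sketch (editor's addition, not part of the example above) ---
# A minimal driver for the train() defined in Example #11, assuming the
# actor/critic/memory/noise classes from the pre-TF2 OpenAI baselines DDPG
# package and the Pendulum-v0 gym environment; all hyperparameter values
# below are illustrative only.
import gym
import numpy as np
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory
from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]

# Replay buffer, networks, and exploration noise expected by train().
memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
actor = Actor(nb_actions, layer_norm=True)
critic = Critic(layer_norm=True)
action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=0.2 * np.ones(nb_actions))

train(env,
      nb_epochs=100, nb_episodes=20, episode_length=200,
      nb_train_steps=50, eval_freq=10, nb_eval_episodes=5,
      actor=actor, critic=critic, memory=memory,
      gamma=0.99, normalize_returns=False, normalize_observations=True,
      critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
      action_noise=action_noise, popart=False, clip_norm=None,
      batch_size=64, reward_scale=1.0)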
Example #12
def evaluate(env,
             nb_episodes,
             reward_scale,
             render,
             param_noise,
             action_noise,
             actor,
             critic,
             memory,
             critic_l2_reg,
             normalize_returns=False,
             normalize_observations=True,
             weight_file=None):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        if weight_file:
            saver = tf.train.Saver(actor.trainable_vars +
                                   critic.trainable_vars)
            saver.restore(sess, weight_file)
            agent.actor_optimizer.sync()
            agent.critic_optimizer.sync()
        # sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        total_reward = 0.0
        max_steps = 2000
        for ep in range(nb_episodes):
            i = 0
            done = False
            episode_reward = 0.0
            while not done and i < max_steps:
                action, q, all_actions, sample = agent.pi(obs,
                                                          apply_noise=False,
                                                          compute_Q=True)
                assert action.shape == env.action_space.shape

                assert max_action.shape == action.shape
                obs, r, done, info = env.step(max_action * action)
                episode_reward += r
                # env.render()
                # print('Action:{}, reward:{}'.format(action, r))
                # time.sleep(0.1)
                i += 1
            total_reward += episode_reward
            logger.info("Episode:{}, reward:{}, steps:{}".format(
                ep, episode_reward, i))
            if done:
                obs = env.reset()

        logger.info("Average reward:{}, total reward:{}, episodes:{}".format(
            (total_reward / nb_episodes), total_reward, nb_episodes))
Example #13
def launch(env_name,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           temperature,
           prioritization,
           binding,
           version,
           dump_buffer,
           n_cycles,
           rank_method,
           w_potential,
           w_linear,
           w_rotational,
           clip_energy,
           override_params={},
           save_policies=True):

    # Fork for multi-CPU MPI implementation.
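    # mpi_fork re-launches the current script under MPI; the original parent
    # process exits here, and each spawned worker continues below with its
    # own MPI rank.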
    if num_cpu > 1:
        #whoami = mpi_fork(num_cpu, binding)
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)
    """
    if logging: 
        logdir = 'logs/'+str(env_name)+'-temperature'+str(temperature)+\
                 '-prioritization'+str(prioritization)+'-replay_strategy'+str(replay_strategy)+\
                 '-n_epochs'+str(n_epochs)+'-num_cpu'+str(num_cpu)+'-seed'+str(seed)+\
                 '-n_cycles'+str(n_cycles)+'-rank_method'+str(rank_method)+\
                 '-w_potential'+str(w_potential)+'-w_linear'+str(w_linear)+'-w_rotational'+str(w_rotational)+\
                 '-clip_energy'+str(clip_energy)+\
                 '-version'+str(version)
    else:
        logdir = osp.join(tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)
    """

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['temperature'] = temperature
    params['prioritization'] = prioritization
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params[
        'n_batches'] * num_cpu
    params['version'] = version
    params['dump_buffer'] = dump_buffer
    params['n_cycles'] = n_cycles
    params['rank_method'] = rank_method
    params['w_potential'] = w_potential
    params['w_linear'] = w_linear
    params['w_rotational'] = w_rotational
    params['clip_energy'] = clip_energy
    params['n_epochs'] = n_epochs
    params['num_cpu'] = num_cpu

    if params['dump_buffer']:
        params['alpha'] = 0

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          num_cpu=num_cpu,
          dump_buffer=dump_buffer,
          w_potential=params['w_potential'],
          w_linear=params['w_linear'],
          w_rotational=params['w_rotational'],
          rank_method=rank_method,
          clip_energy=clip_energy)
Example #14
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
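            # allreduce sums each scalar stat across MPI workers; dividing by
            # mpi_size below turns the sums into means over all ranks.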
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #15
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    num_cpu = 1
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
            print("fancy call succeeded")
        except CalledProcessError:
            print("fancy version of mpi call failed, try simple version")
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()

    # Configure logging
    rank = MPI.COMM_WORLD.Get_rank()
    logdir = ''
    if rank == 0:
        if logdir or logger_b.get_dir() is None:
            logger_b.configure(dir=logdir)
    else:
        logger_b.configure()
    logdir = logger_b.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = v['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    def make_env():
        return PnPEnv()

    env = make_env()
    test_env = make_env()
    env.reset()

    # for _ in range(1000):
    #     env.render()
    #     import pdb; pdb.set_trace()
    #     env.step(env.action_space.sample())

    params = config.DEFAULT_PARAMS
    params['action_l2'] = v['action_l2']
    params['max_u'] = v['max_u']
    params['gamma'] = v['discount']
    params['env_name'] = 'FetchReach-v0'
    params['replay_strategy'] = v['replay_strategy']
    params['lr'] = v['lr']
    params['layers'] = v['layers']
    params['hidden'] = v['hidden']
    params['n_cycles'] = v['n_cycles']  # cycles per epoch
    params['n_batches'] = v['n_batches']  # training batches per cycle
    params['batch_size'] = v[
        'batch_size']  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length.
    params['n_test_rollouts'] = v[
        'n_test_rollouts']  # changed from 10 to 3 # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
    # exploration
    params['random_eps'] = 0.3  # percentage of time a random action is taken
    params['noise_eps'] = v['action_noise']
    params['goal_weight'] = v['goal_weight']
    params['scope'] = 'ddpg3'

    params['sample_expert'] = v['sample_expert']
    params['expert_batch_size'] = v['expert_batch_size']
    params['bc_loss'] = v['bc_loss']
    params['anneal_bc'] = v['anneal_bc']
    params['gail_weight'] = v['gail_weight']
    params['terminate_bootstrapping'] = v['terminate_bootstrapping']
    params['mask_q'] = int(v['mode'] == 'pure_bc')
    params['two_qs'] = v['two_qs']
    params['anneal_discriminator'] = v['anneal_discriminator']
    params['two_rs'] = v['two_qs'] or v['anneal_discriminator']
    params['with_termination'] = v['rollout_terminate']

    if 'clip_dis' in v and v['clip_dis']:
        params['dis_bound'] = v['clip_dis']

    with open(os.path.join(logger_b.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    params['T'] = v['horizon']
    params['to_goal'] = v['to_goal']

    params = config.prepare_params(params)
    params['make_env'] = make_env
    config.log_params(params, logger=logger_b)

    dims = config.configure_dims(params)

    # prepare GAIL
    if v['use_s_p']:
        discriminator = GAIL(dims['o'] + dims['o'] +
                             dims['g'] if not v['only_s'] else dims['o'] +
                             dims['g'],
                             dims['o'],
                             dims['o'],
                             dims['g'],
                             0.,
                             gail_loss=v['gail_reward'],
                             use_s_p=True,
                             only_s=v['only_s'])
    else:
        discriminator = GAIL(dims['o'] + dims['u'] +
                             dims['g'] if not v['only_s'] else dims['o'] +
                             dims['g'],
                             dims['o'],
                             dims['u'],
                             dims['g'],
                             0.,
                             gail_loss=v['gail_reward'],
                             only_s=v['only_s'])
    params['discriminator'] = discriminator

    # configure replay buffer for expert buffer
    params_expert = {
        k: params[k]
        for k in [
            'make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs',
            'with_termination'
        ]
    }
    params_expert[
        'replay_strategy'] = 'future' if v['relabel_expert'] else 'none'

    params_policy_buffer = {
        k: params[k]
        for k in [
            'make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs',
            'with_termination'
        ]
    }
    params_policy_buffer['replay_strategy'] = 'future'

    params_empty = {
        k: params[k]
        for k in [
            'make_env', 'replay_k', 'discriminator', 'gail_weight',
            'replay_strategy'
        ]
    }

    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=v['clip_return'],
                                   reuse=tf.AUTO_REUSE,
                                   env=env,
                                   to_goal=v['to_goal'])

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker([env], policy, dims, logger_b,
                                   **rollout_params)
    # rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker([env], policy, dims, logger_b, **eval_params)
    # evaluator.seed(rank_seed)

    n_traj = v['n_evaluation_traj']

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()
    inner_log_dir = osp.join(log_dir, 'inner_iters')
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)
    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    logger.log("Starting the outer iterations")

    logger.log("Generating heat map")

    def evaluate_pnp(env, policy, n_rollouts=100):
        goal_reached = []
        distance_to_goal = []
        for i in range(n_rollouts):
            traj = rollout(env,
                           policy,
                           max_path_length=v['horizon'],
                           using_gym=True)
            goal_reached.append(np.max(traj['env_infos']['goal_reached']))
            distance_to_goal.append(np.min(traj['env_infos']['distance']))

        return np.mean(goal_reached), np.mean(distance_to_goal)

    from sandbox.experiments.goals.pick_n_place.pnp_expert import PnPExpert

    expert_policy = PnPExpert(env)

    expert_params = {
        'exploit': not v['noisy_expert'],
        'use_target_net': False,
        'use_demo_states': False,
        'compute_Q': False,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        expert_params[name] = params[name]

    expert_params['noise_eps'] = v['expert_noise']
    expert_params['random_eps'] = v['expert_eps']

    expert_worker = RolloutWorker([env], expert_policy, dims, logger_b,
                                  **expert_params)

    input_shapes = dims_to_shapes(dims)
    expert_sample_transitions = config.configure_her(params_expert)
    buffer_shapes = {
        key:
        (v['horizon'] if key != 'o' else v['horizon'] + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
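    # 'o' gets horizon + 1 slots because each stored episode also keeps the
    # terminal observation; the goal shapes are overridden just below.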
    buffer_shapes['g'] = (buffer_shapes['g'][0],
                          3 if not v['full_space_as_goal'] else 6)
    buffer_shapes['ag'] = (v['horizon'] + 1,
                           3 if not v['full_space_as_goal'] else 6)
    buffer_shapes['successes'] = (v['horizon'], )
    expert_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'],
                                 expert_sample_transitions)
    policy.expert_buffer = expert_buffer

    sample_transitions_relabel = config.configure_her(params_policy_buffer)

    for _ in range(v['num_demos']):
        # rollout is generated by expert policy
        episode = expert_worker.generate_rollouts(
            slice_goal=(3, 6) if v['full_space_as_goal'] else None)
        # and is stored into the current expert buffer
        expert_buffer.store_episode(episode)

        # TODO: what is subsampling_rate
    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            tf.get_default_session().run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    init_new_vars_op = tf.initialize_variables(uninitialized_vars)
    tf.get_default_session().run(init_new_vars_op)

    max_success, min_distance = evaluate_pnp(env, policy)
    outer_iter = 0
    logger.record_tabular("Outer_iter", outer_iter)
    logger.record_tabular("Outer_Success", max_success)
    logger.record_tabular("MinDisToGoal", min_distance)
    logger.dump_tabular()

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        with ExperimentLogger(inner_log_dir,
                              outer_iter,
                              snapshot_mode='last',
                              hold_outter_log=True):
            train(
                policy,
                discriminator,
                rollout_worker,
                v['inner_iters'],
                v['n_cycles'],
                v['n_batches'],
                v['n_batches_dis'],
                policy.buffer,
                expert_buffer,
                empty_buffer=empty_buffer if v['on_policy_dis'] else None,
                num_rollouts=v['num_rollouts'],
                feasible_states=feasible_states if v['query_expert'] else None,
                expert_policy=expert_policy if v['query_expert'] else None,
                agent_policy=policy if v['query_agent'] else None,
                train_dis_per_rollout=v['train_dis_per_rollout'],
                noise_expert=v['noise_dis_agent'],
                noise_agent=v['noise_dis_expert'],
                sample_transitions_relabel=sample_transitions_relabel
                if v['relabel_for_policy'] else None,
                outer_iter=outer_iter,
                annealing_coeff=v['annealing_coeff'],
                q_annealing=v['q_annealing'])

        print("evaluating policy performance")

        logger.log("Generating heat map")

        success, min_distance = evaluate_pnp(env, policy)

        logger.record_tabular("Outer_iter", outer_iter)
        logger.record_tabular("Outer_Success", max_success)
        logger.record_tabular("MinDisToGoal", min_distance)
        logger.dump_tabular()

        if success > max_success:
            print("% f >= %f, saving policy to params_best" %
                  (success, max_success))
            with open(osp.join(log_dir, 'params_best.pkl'), 'wb') as f:
                cloudpickle.dump({'env': env, 'policy': policy}, f)
            max_success = success

        report.save()
        report.new_row()
Example #16
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          teacher,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    PATH = 'results/ddpg'

    #assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        agent.restore_model(PATH)
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)

                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            eval_action)
                        eval_env.background = get_q_background(
                            eval_env, agent.q, eval_action)

                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            agent.save_model(PATH, epoch)
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #17
def train(args, seed, writer=None):
    from baselines.ppo1 import pposgd_simple_gcn, gcn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    if args.env == 'molecule':
        env = gym.make('molecule-v0')
        env.init(
            data_type=args.dataset,
            logp_ratio=args.logp_ratio,
            qed_ratio=args.qed_ratio,
            sa_ratio=args.sa_ratio,
            recons_ratio=args.recons_ratio,
            reward_step_total=args.reward_step_total,
            is_normalize=args.normalize_adj,
            reward_type=args.reward_type,
            reward_target=args.reward_target,
            has_feature=bool(args.has_feature),
            is_conditional=bool(args.is_conditional),
            conditional=args.conditional,
            max_action=args.max_action,
            min_action=args.min_action)  # remember to call this after gym.make!
    elif args.env == 'graph':
        env = GraphEnv()
        env.init(reward_step_total=args.reward_step_total,
                 is_normalize=args.normalize_adj,
                 dataset=args.dataset)  # remember to call this after gym.make!
    print(env.observation_space)

    # if not os.path.exists(args.traj_data_path):
    #     env.store_all_expert_trajs(args)

    def policy_fn(name, ob_space, ac_space):
        return gcn_policy.GCNPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    atom_type_num=env.atom_type_num,
                                    char_type_num=len(env.smile_chars),
                                    args=args)

    env.seed(workerseed)
    #print(device_lib.list_local_devices())
    pposgd_simple_gcn.learn(args,
                            env,
                            policy_fn,
                            max_timesteps=args.num_steps,
                            timesteps_per_actorbatch=256,
                            clip_param=0.2,
                            entcoeff=0.01,
                            optim_epochs=8,
                            optim_stepsize=args.lr,
                            optim_batchsize=32,
                            gamma=1,
                            lam=0.95,
                            schedule='linear',
                            writer=writer)
    env.close()
Example #18
def main():
    '''
    Load and play trained policy
    '''
    log_root = os.path.join(os.getcwd(), 'logs')
    extra_args = ExtraArgs(log_root=log_root)

    env = make_mujoco_env(extra_args.env_id, extra_args.seed)

    if isinstance(env.unwrapped, CeresEnv) and (len(extra_args.trained_cnet) > 0):
        env.unwrapped.init_ceres()
        env.unwrapped.init_constraint_prediction(extra_args.trained_cnet)

    episode_lengths = np.zeros(extra_args.max_episodes)
    episode_rewards = np.zeros(extra_args.max_episodes)
    ob = env.reset()

    do_save_render = extra_args.render and len(extra_args.save_render) > 0
    if do_save_render:
        os.makedirs(extra_args.save_render, exist_ok=True)

    def save_render(i_step, max_step=300, verbose=True):
        n_digits = len(str(max_step))
        do_save_step = (max_step <= 0) or (i_step <= max_step)
        if do_save_render and do_save_step:
            path_save = os.path.join(extra_args.save_render, str(i_step).zfill(n_digits) + '.png')
            env.unwrapped.save_render(path_save, verbose=verbose)

    ob_space = env.unwrapped.observation_space
    ac_space = env.unwrapped.action_space
    ob_space, policy_observation_filter = build_policy_observation_filter(extra_args, ob_space)

    env.unwrapped.set_ineq_margin(extra_args.conservative_exploration)

    if len(extra_args.trained_policy) > 0:
        assert os.path.exists(extra_args.trained_policy), 'Invalid path to model: \'{0}\''.format(extra_args.trained_policy)
        from ceres.baselines.ceres.mlp_policy_saver import MlpPolicySaver
        from baselines.common import tf_util as U
        sess = U.single_threaded_session()
        sess.__enter__()

        def policy_fn(name, ob_space, ac_space):
            return MlpPolicySaver(name, ob_space=ob_space, ac_space=ac_space,
                hid_size=extra_args.policy_hidden_size, num_hid_layers=extra_args.policy_hidden_layers)
        pi = policy_fn('pi', ob_space, ac_space)

        U.initialize()
        pi.restore_model(extra_args.trained_policy, session=sess)
    else:
        print('Invalid model path \'{0}\', use dummy agent'.format(extra_args.trained_policy))
        pi = DummyPolicy('pi', ob_space, ac_space)

    time_total = 0.
    n_steps_global = -1
    for i_episode in range(extra_args.max_episodes):
        print('Episode {0}'.format(i_episode))
        time_episode_begin = time.time()
        ob = policy_observation_filter(ob)
        n_steps_global += 1
        if extra_args.render:
            env.render()
            save_render(n_steps_global)
        done = False
        ep_rew = 0.
        i_step = 0
        time.sleep(extra_args.play_step_duration)
        
        while not done:
            action, vpred = pi.act(True, ob)
            ob, rew, done, info = env.step(action)
            ob = policy_observation_filter(ob)
            ep_rew += rew
            i_step += 1
            n_steps_global += 1
            if extra_args.render:
                env.render()
                save_render(n_steps_global)
            time.sleep(extra_args.play_step_duration)
        episode_lengths[i_episode] = i_step
        episode_rewards[i_episode] = ep_rew
        time_episode = time.time() - time_episode_begin
        time_total += time_episode
        print('  Episode length: {0} (average {1:.1f}), episode reward {2:.1f} (average {5:.1f}), duration {3:.1f} ms (average {4:.1f})'.format(i_step, np.average(episode_lengths[:i_episode+1]), ep_rew, 1000.*time_episode, 1000.*time_total/(i_episode+1), np.average(episode_rewards[:i_episode+1])))
        ob = env.reset()
Example #19
def testModelPolicy(env,
                    policy,
                    eval_steps=4,
                    gamma=1,
                    render=False,
                    checkpoint_file="tf_checkpoint/general/model.ckpt",
                    restore_variables=False,
                    save_variables=True,
                    logdir=None,
                    log=False,
                    overwrite_log=False,
                    theta=5,
                    use_gp_env=False,
                    gp_env=None,
                    **kwargs):
    states = list()
    next_states = list()
    rewards = list()
    actions_one_hot = list()
    actions = list()
    timesteps = list()
    mask = None

    # statistics
    wins = 0
    reward_list = list()
    paths = list()
    small_vel = 0
    obs_size = 2
    state_tf = tf.placeholder(tf.float32, (None, obs_size), name="states")
    policy_tf, _ = policy(state_tf)
    n_actions = 2

    # Start TF session
    with U.single_threaded_session() as sess:
        # to save variables
        saver = tf.train.Saver()

        # initialize all
        if restore_variables:
            # Add ops to save and restore all the variables.
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_file))
        else:
            init = tf.global_variables_initializer()
            sess.run(init)

        # make sure all variables are initialized
        sess.run(tf.assert_variables_initialized())
        pi = make_pi(policy_tf, sess, state_tf, n_actions)

        for n in range(10):  # collect 10 evaluation episodes (note: the eval_steps argument is not used here)
            paths.append(list())
            rewards_i = list()
            states_i = list()
            next_states_i = list()
            mask_i = list()
            actions_i_one_hot = list()
            actions_i = list()
            done = False

            # gamma_cum is gamma^t
            gamma_cum = 1
            gamma = 1
            cum_reward = 0
            reward = 0
            timesteps_i = 0

            # Sampling logic
            state = env.reset()
            paths[n].append(state)
            while not done:

                # Select action a_t according to current policy
                a_t = pi(state)
                env.render()

                newState, reward, done, info = env.step(a_t)

                # add to the buffer to remember
                # rewards_i.append(reward*gamma_cum)
                rewards.append(reward * gamma_cum)
                paths[n].append(newState)

                # works with two actions
                # actions_i.append(a_t-1)
                actions.append(a_t - 1)

                # create a one hot vector with the taken action and add to the action matrix
                action_blank = np.zeros(n_actions)
                action_blank[a_t] = 1
                # actions_i_one_hot.append(action_blank)
                actions_one_hot.append(action_blank)

                # calculation of the reward
                cum_reward += reward * gamma_cum
                gamma_cum = gamma_cum * gamma

                # states_i.append(np.append(np.append(state,action),theta))
                states_i.append(state)
                next_states_i.append(np.array(newState - state))
                state = newState

                timesteps_i += 1

                if info["goal_reached"]:
                    wins += 1
                    print(gamma_cum)
                if info["small_vel"]:
                    print("Small vel")
                    small_vel += 1

            states.append(states_i)
            next_states.append(next_states_i)
            # rewards.append(rewards_i)
            timesteps.append(timesteps_i)
            reward_list.append(cum_reward)
            # actions_one_hot.append(actions_i_one_hot)
            # actions.append(actions_i)

        stats = {
            "states": states,
            "next_states": next_states,
            "rewards": rewards,
            "timesteps": timesteps,
            "reward_list": reward_list,
            "actions_one_hot": actions_one_hot,
            "actions": actions,
            "wins": wins,
            "paths": paths,
            "small_vel": small_vel,
        }
    # print(stats)
    print(np.mean(stats["reward_list"]))
    return stats
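make_pi is not defined in this example. The sketch below shows one way such a helper could look, assuming policy_tf yields action probabilities of shape (1, n_actions); it is an illustration, not the original helper.

import numpy as np

def make_pi(policy_tf, sess, state_tf, n_actions):
    # Illustrative helper: wrap the policy output in a callable that samples an action index.
    def pi(state):
        probs = sess.run(policy_tf, feed_dict={state_tf: np.asarray(state)[np.newaxis, :]})
        return int(np.random.choice(n_actions, p=probs.ravel()))
    return pi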
Example #20
0
def launch(env_name,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           binding,
           logging,
           version,
           n_cycles,
           note,
           override_params={},
           save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging

    if logging:
        logdir = 'logs/' + str(env_name) + '-replay_strategy' + str(
            replay_strategy) + '-n_epochs' + str(n_epochs) + '-num_cpu' + str(
                num_cpu) + '-seed' + str(seed) + '-n_cycles' + str(
                    n_cycles) + '-version' + str(
                        version) + '-T-' + datetime.datetime.now().strftime(
                            "%Y-%m-%d-%H-%M-%S")
    else:
        logdir = osp.join(
            tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()  # use temp folder for other rank
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params[
        'n_batches'] * num_cpu
    params['version'] = version
    params['n_cycles'] = n_cycles
    params['num_cpu'] = num_cpu
    params['note'] = note or params['note']
    if note:
        with open('params/' + env_name + '/' + note + '.json', 'r') as file:
            override_params = json.loads(file.read())
            params.update(**override_params)

    if params['load_weight']:
        if type(params['load_weight']) is list:
            params['load_weight'] = params['load_weight'][seed]
        base = os.path.splitext(params['load_weight'])[0]
        policy_weight_file = open(base + '_weight.pkl', 'rb')
        pretrain_weights = pickle.load(policy_weight_file)
        policy_weight_file.close()
    else:
        pretrain_weights = None

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   pretrain_weights=pretrain_weights,
                                   clip_return=clip_return)

    render = False
    if params['collect_video']:
        render = 'rgb_array'

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'render': render,
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          num_cpu=num_cpu,
          collect_data=params['collect_data'],
          collect_video=params['collect_video'],
          goal_generation=params['goal_generation'],
          num_skills=params['num_skills'],
          use_skill_n=params['use_skill_n'],
          batch_size=params['_batch_size'],
          mi_r_scale=params['mi_r_scale'],
          mi_end_epoch=params['mi_end_epoch'],
          sk_r_scale=params['sk_r_scale'],
          no_train_mi=params['no_train_mi'])
Example #21
0
def train_trpo(num_timesteps,
               eval_episodes,
               seed,
               horizon,
               out_dir='.',
               load_path=None,
               checkpoint_path_in=None,
               gamma=0.99,
               grid_size=5,
               first_zone=-1.0,
               second_zone=-10.,
               action=2.,
               timesteps_per_batch=500,
               rand_initial=True,
               clip_mean=False,
               direction='border',
               fail_prob=0.1,
               border_width=2.,
               continuous=True,
               n_basis=None,
               num_layers=0,
               num_hidden=32,
               checkpoint_freq=20,
               init_logstd=-1,
               trainable_variance=False,
               trainable_bias=False):
    if n_basis is None:
        #n_basis = np.array([grid_size, 2 * grid_size])
        n_basis = np.array([2 * grid_size, 4 * grid_size])
    start_time = time.time()
    clip = None
    if clip_mean:
        clip = (-5, 5)
    rew_weights = [first_zone, second_zone, action]
    print(rew_weights)
    print(fail_prob)
    print(horizon)
    if continuous:
        dir = 'cont_gridworld'
        env = GridWorldAction(shape=[grid_size, grid_size],
                              rew_weights=rew_weights,
                              horizon=horizon,
                              randomized_initial=rand_initial,
                              fail_prob=fail_prob,
                              border_width=border_width,
                              n_bases=n_basis,
                              direction=direction)
        env_eval = GridWorldAction(shape=[grid_size, grid_size],
                                   rew_weights=rew_weights,
                                   horizon=horizon,
                                   randomized_initial=rand_initial,
                                   fail_prob=fail_prob,
                                   border_width=border_width,
                                   n_bases=n_basis,
                                   direction=direction)
    else:
        dir = 'gridworld'
        env = GridWorld(gamma=gamma,
                        rew_weights=rew_weights,
                        fail_prob=fail_prob,
                        horizon=horizon,
                        shape=(grid_size, grid_size),
                        randomized_initial=rand_initial,
                        direction=direction)
        env_eval = GridWorld(gamma=gamma,
                             rew_weights=rew_weights,
                             fail_prob=fail_prob,
                             horizon=horizon,
                             shape=(grid_size, grid_size),
                             randomized_initial=rand_initial,
                             direction=direction)

    directory_output = (dir + '/trpo-rews-' + str(first_zone) + '_' +
                        str(second_zone) + '_' + str(action)) + '/' + direction

    def eval_policy_closure(**args):
        return eval_and_render_policy(env_eval, **args)

    tf.set_random_seed(seed)
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    time_str = str(start_time)
    if rank == 0:
        logger.configure(dir=out_dir + '/' + directory_output + '/logs',
                         format_strs=['stdout', 'csv'],
                         suffix=time_str)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    trpo_mpi.learn(network=network,
                   env=env,
                   eval_policy=eval_policy_closure,
                   timesteps_per_batch=timesteps_per_batch,
                   max_kl=0.001,
                   cg_iters=10,
                   cg_damping=1e-3,
                   total_timesteps=num_timesteps,
                   gamma=gamma,
                   lam=1.0,
                   vf_iters=3,
                   vf_stepsize=1e-4,
                   checkpoint_freq=checkpoint_freq,
                   checkpoint_dir_out=out_dir + '/' + directory_output +
                   '/models/' + time_str + '/',
                   load_path=load_path,
                   checkpoint_path_in=checkpoint_path_in,
                   eval_episodes=eval_episodes,
                   init_logstd=init_logstd,
                   trainable_variance=trainable_variance,
                   trainable_bias=trainable_bias,
                   clip=clip)  # pass the clip range computed from clip_mean above (was hard-coded to None)
    print('TOTAL TIME:', time.time() - start_time)
    print("Time taken: %f seg" % ((time.time() - start_time)))
    print("Time taken: %f hours" % ((time.time() - start_time) / 3600))

    env.close()
Example #22
0
def run(config):
    sess = U.single_threaded_session(gpu=False)
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    is_chef = (rank == 0)

    workerseed = config.seed + 10000 * rank
    set_global_seeds(workerseed)

    if is_chef:
        logger.configure()
    else:
        logger.set_level(logger.DISABLED)
        config.render = False
        config.record = False

    env_name = config.env
    env = make_env(env_name, config)

    if is_chef and config.is_train:
        with open(osp.join(config.log_dir, "args.txt"), "a") as f:
            f.write("\nEnvironment argument:\n")
            for k in sorted(env.unwrapped._config.keys()):
                f.write("{}: {}\n".format(k, env.unwrapped._config[k]))

    networks = []

    # build models
    if config.hrl:
        assert config.primitive_envs is not None and config.primitive_paths is not None

        logger.info('====== Module list ======')
        num_primitives = len(config.primitive_envs)
        for primitive_env_name, primitive_path in zip(config.primitive_envs,
                                                      config.primitive_paths):
            logger.info('Env: {}, Dir: {}'.format(primitive_env_name,
                                                  primitive_path))

        meta_pi = MetaPolicy(name="%s/meta_pi" % env_name,
                             env=env,
                             ob_env_name=env_name,
                             primitives=config.primitive_envs,
                             config=config)

        meta_oldpi = MetaPolicy(name="%s/meta_oldpi" % env_name,
                                env=env,
                                ob_env_name=env_name,
                                primitives=config.primitive_envs,
                                config=config)

        primitive_pis = [
            PrimitivePolicy(name="%s/pi" % primitive_env_name,
                            env=env,
                            ob_env_name=primitive_env_name,
                            config=config)
            for primitive_env_name in config.primitive_envs
        ]

        trans_pis, trans_oldpis = None, None
        if config.use_trans:
            trans_pis = [
                TransitionPolicy(
                    name="%s/transition_pi" % primitive_env_name,
                    env=env,
                    ob_env_name=env_name
                    if config.trans_include_task_obs else primitive_env_name,
                    num_primitives=num_primitives,
                    trans_term_activation=config.trans_term_activation,
                    config=config)
                for primitive_env_name in config.primitive_envs
            ]
            trans_oldpis = [
                TransitionPolicy(
                    name="%s/transition_oldpi" % primitive_env_name,
                    env=env,
                    ob_env_name=env_name
                    if config.trans_include_task_obs else primitive_env_name,
                    num_primitives=num_primitives,
                    trans_term_activation=config.trans_term_activation,
                    config=config)
                for primitive_env_name in config.primitive_envs
            ]
            networks.extend(trans_pis)
            networks.extend(trans_oldpis)
        networks.append(meta_pi)
        networks.append(meta_oldpi)
        networks.extend(primitive_pis)

        # build proximity_predictor
        proximity_predictors = None
        if config.use_proximity_predictor:
            portion_start = [
                float(v) for v in config.proximity_use_traj_portion_start
            ]
            portion_end = [
                float(v) for v in config.proximity_use_traj_portion_end
            ]
            if len(portion_start) == 1:
                portion_start = portion_start * num_primitives
            if len(portion_end) == 1:
                portion_end = portion_end * num_primitives

            proximity_predictors = [
                ProximityPredictor(
                    name="%s/proximity_predictor" % primitive_env_name,
                    path=path,
                    env=env,
                    ob_env_name=primitive_env_name,  # make env for every primitive
                    use_traj_portion_end=portion_end,
                    use_traj_portion_start=portion_start,
                    is_train=config.is_train,
                    config=config
                ) for primitive_env_name, path, portion_start, portion_end in \
                zip(config.primitive_envs, config.primitive_paths, portion_start, portion_end)]
            networks.extend(proximity_predictors)

        # build trainer
        from rl.trainer import Trainer
        trainer = Trainer(env, meta_pi, meta_oldpi, proximity_predictors,
                          num_primitives, trans_pis, trans_oldpis, config)

        # build rollout
        rollout = rollouts.traj_segment_generator(
            # stochastic=config.is_train, config=config)
            env,
            meta_pi,
            primitive_pis,
            trans_pis,
            stochastic=True,
            config=config,
            proximity_predictors=proximity_predictors,
        )
    else:
        # build vanilla TRPO
        policy = MlpPolicy(env=env,
                           name="%s/pi" % env_name,
                           ob_env_name=env_name,
                           config=config)

        old_policy = MlpPolicy(env=env,
                               name="%s/oldpi" % env_name,
                               ob_env_name=env_name,
                               config=config)
        networks.append(policy)
        networks.append(old_policy)

        # build trainer
        from rl.trainer_rl import RLTrainer
        trainer = RLTrainer(env, policy, old_policy, config)
        # build rollout
        rollout = rollouts.traj_segment_generator_rl(
            # env, policy, stochastic=config.is_train, config=config)
            env,
            policy,
            stochastic=not config.is_collect_state,
            config=config)

    # initialize models
    def load_model(load_model_path, var_list=None):
        if os.path.isdir(load_model_path):
            ckpt_path = tf.train.latest_checkpoint(load_model_path)
        else:
            ckpt_path = load_model_path
        if ckpt_path:
            U.load_state(ckpt_path, var_list)
        return ckpt_path

    if config.load_meta_path is not None:
        var_list = meta_pi.get_variables() + meta_oldpi.get_variables()
        ckpt_path = load_model(config.load_meta_path, var_list)
        logger.info(
            '* Load the meta policy from checkpoint: {}'.format(ckpt_path))

    def tensor_description(var):
        description = '({} [{}])'.format(
            var.dtype.name, 'x'.join([str(size) for size in var.get_shape()]))
        return description

    var_list = []
    for network in networks:
        var_list += network.get_variables()
    if is_chef:
        for var in var_list:
            logger.info('{} {}'.format(var.name, tensor_description(var)))

    if config.load_model_path is not None:
        # Load all the network
        if config.is_train:
            ckpt_path = load_model(config.load_model_path)
            if config.hrl:
                load_buffers(proximity_predictors, ckpt_path)
        else:
            ckpt_path = load_model(config.load_model_path, var_list)
        logger.info(
            '* Load all policies from checkpoint: {}'.format(ckpt_path))
    elif config.is_train:
        ckpt_path = tf.train.latest_checkpoint(config.log_dir)
        if config.hrl:
            if ckpt_path:
                ckpt_path = load_model(ckpt_path)
                load_buffers(proximity_predictors, ckpt_path)
            else:
                # Only load the primitives
                for (primitive_name,
                     primitive_pi) in zip(config.primitive_paths,
                                          primitive_pis):
                    var_list = primitive_pi.get_variables()
                    if var_list:
                        primitive_path = osp.expanduser(
                            osp.join(config.primitive_dir, primitive_name))
                        ckpt_path = load_model(primitive_path, var_list)
                        logger.info("* Load module ({}) from {}".format(
                            primitive_name, ckpt_path))
                    else:
                        logger.info(
                            "* Hard-coded module ({})".format(primitive_name))
            logger.info("Loading modules is done.")
        else:
            if ckpt_path:
                ckpt_path = load_model(ckpt_path)
    else:
        logger.info('[!] Checkpoint for evaluation is not provided.')
        ckpt_path = load_model(config.log_dir, var_list)
        logger.info(
            "* Load all policies from checkpoint: {}".format(ckpt_path))

    if config.is_train:
        trainer.train(rollout)
    else:
        if config.evaluate_proximity_predictor:
            trainer.evaluate_proximity_predictor(var_list)
        else:
            trainer.evaluate(rollout, ckpt_num=ckpt_path.split('/')[-1])

    env.close()
Example #23
0
def train(
    env: ConfMDP,
    policy: Policy,
    model_approximator: ModelApproximator,
    eval_steps: int = 4,
    eval_freq: int = 5,
    n_trajectories: int = 20,
    iteration_number: int = 2000,
    gamma: float = 1,
    render=False,
    checkpoint_file: str = "tf_checkpoint/general/model.ckpt",
    restore_variables: bool = False,
    save_variables: bool = True,
    logdir: str = None,
    log: bool = False,
    omega=5,
    kappa: float = 1e-5,
    training_set_size: int = 500,
    normalize_data: bool = False,
    dual_reg: float = 0.0,
    policy_reg: float = 0.0,
    exact: bool = False,
    num_processes: int = 1,
    load_data: bool = True,
    **kwargs,
):
    """
    Runner for the REMPS algorithm.
    Setup logging, initialize agent, takes care of fitting or loading things.
    Executes the main training loop by managing workers
    :param env: Environment (Conf-MDP)
    :param policy: The agent policy
    :param model_approximator: the approximation of the model or the true model
    :param eval_steps: how many steps in order to perform evaluation
    :param eval_freq: the frequency of evaluation
    :param n_trajectories: number of trajectories to collect
    :param iteration_number: number of iterations of REMPS
    :param gamma: discount factor
    :param render: render or not episodes
    :param checkpoint_file: where to store checkpoints
    :param restore_variables: restore variables or not from checkpoint
    :param save_variables: save variables in checkpoint
    :param logdir: directory containing logs
    :param log: if true the agents logs the actions probability
    :param omega: initial environment parameters
    :param kappa: parameter of remps environment
    :param training_set_size: number of samples contained in the training set
    :param normalize_data: Whether to normalize data from the training set
    :param dual_reg: regularization on the dual
    :param policy_reg: regularization on the policy
    :param exact: whether the model approximation is exact or not
    :param num_processes: number of processing
    :param load_data: whether to load stored data
    :param kwargs:
    :return:
    """

    # setup logging
    writer = tf.summary.FileWriter(logdir)
    logger.configure(dir=logdir, format_strs=["stdout", "csv"])

    # setup agent
    agent = REMPS(
        policy=policy,
        model=model_approximator,
        env=env,
        kappa=kappa,
        projection_type=Projection.STATE_KERNEL,
        use_features=False,
        training_set_size=training_set_size,
        L2_reg_dual=dual_reg,
        L2_reg_loss=policy_reg,
        exact=exact,
    )

    # create parallel samplers
    # Split work among workers
    n_steps = n_trajectories
    nb_episodes_per_worker = n_steps // num_processes

    inputQs = [Queue() for _ in range(num_processes)]
    outputQ = Queue()
    workers = [
        SamplingWorker(
            policy,
            env,
            nb_episodes_per_worker,
            inputQs[i],
            outputQ,
            env.action_space.n,
            env.observation_space_size,
        )
        for i in range(num_processes)
    ]

    # Start the workers
    for w in workers:
        w.start()

    # Collect data for model fitting
    # torcs model fitting needs to be done before the session initialization
    # due to multiprocessing issues
    if not load_data and isinstance(env, Torcs):
        x, y, avg_rew, ret = collect_data(
            env,
            policy=policy,
            total_n_samples=training_set_size,
            n_params=2,
            initial_port=env.port + 1000,
        )
        logger.log(
            f"Data collection terminated. Avg rew: {np.mean(avg_rew)}, Avg ret: {np.mean(ret)}",
            logger.INFO,
        )

    with U.single_threaded_session() as sess:

        # initialization with session
        agent.initialize(sess, writer, omega)

        # to save variables
        saver = tf.train.Saver()

        # initialize all
        if restore_variables:
            # Add ops to save and restore all the variables.
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_file))
        else:
            init = tf.global_variables_initializer()
            sess.run(init)

        # make sure all variables are initialized
        sess.run(tf.assert_variables_initialized())

        logger.log("Collecting Data", level=logger.INFO)

        if not load_data and not isinstance(env, Torcs):
            x, y = run_env(
                env,
                episode_count=1,
                bins=200,
                omega_max=30,
                omega_min=1,
                n_samples_per_omega=500,
                policy=agent,
                grid=True,
                total_n_samples=training_set_size,
            )

            # store data in the agent
            agent.store_data(x, y, normalize_data)

            logger.log("Data Stored", logger.INFO)

        # fit the model
        agent.fit()

        logger.log("Model fitted", logger.INFO)

        # set configurable parameters
        env.set_params(omega)

        get_parameters = U.GetFlat(agent.get_policy_params())

        # -------------------------------------
        # --------- Training Loop -------------
        # -------------------------------------

        for n in range(iteration_number):
            states = list()
            next_states = list()
            rewards = list()
            actions_one_hot = list()
            actions = list()
            timesteps = list()
            paths = list()

            # statistics
            wins = 0
            small_vel = 0
            traj = 0
            confort_violation = 0
            reward_list = list()
            policy_ws = get_parameters()

            # Run parallel sampling:
            # for each worker send message sample with
            # policy weights and environment parameters
            for i in range(num_processes):
                inputQs[i].put(("sample", policy_ws, omega))

            # Collect results when ready
            with timed("sampling"):
                for i in range(num_processes):
                    _, stats = outputQ.get()
                    states.extend(stats["states"])
                    paths.extend(stats["paths"])
                    next_states.extend(stats["next_states"])
                    rewards.extend(stats["rewards"])
                    actions_one_hot.extend(stats["actions_one_hot"])
                    actions.extend(stats["actions"])
                    timesteps.extend(stats["timesteps"])
                    reward_list.extend(stats["reward_list"])
                    wins += stats["wins"]
                    small_vel += stats["small_vel"]
                    traj += stats["traj"]
                    confort_violation += stats["confort_violation"]

            samples_data = {
                "actions": np.matrix(actions).transpose(),
                "actions_one_hot": np.array(actions_one_hot),
                "observations": states,
                "paths": paths,
                "rewards": np.transpose(np.expand_dims(np.array(rewards), axis=0)),
                "reward_list": reward_list,
                "timesteps": timesteps,
                "wins": (wins / traj) * 100,
                "omega": omega,
                "traj": traj,
                "confort_violation": confort_violation,
            }

            # print statistics
            logger.log(f"Training steps: {n}", logger.INFO)
            logger.log(f"Number of wins: {wins}", logger.INFO)
            logger.log(f"Percentage of wins: {(wins/n_trajectories)*100}", logger.INFO)
            logger.log(f"Average reward: {np.mean(reward_list)}", logger.INFO)
            logger.log(f"Avg timesteps: {np.mean(timesteps)}")

            # learning routine
            with timed("training"):
                omega = agent.train(samples_data)

            # Configure environments with
            # parameters returned by the agent
            env.set_params(omega)

            # Only TORCS: we kill torcs every 10 iterations due to a memory leak
            if n % 10 == 0 and isinstance(env, Torcs):
                print("Killing torcs")
                os.system("ps | grep torcs | awk '{print $1}' | xargs kill -9")

            # -------------------------------------
            # --------- Evaluation ----------------
            # -------------------------------------
            if ((n + 1) % eval_freq) == 0:

                # for plotting
                eval_rewards = []

                # evaluation loop
                for i in range(eval_steps):

                    logger.log("Evaluating...", logger.INFO)
                    state = env.reset()
                    done = False
                    # gamma_cum is gamma^t
                    gamma_cum = 1
                    cum_reward = 0
                    t = 0

                    # here starts an episode
                    while not done:

                        if render:
                            env.render()

                        # sample an action from the current policy
                        action = agent.pi(state[np.newaxis, :], log=log)

                        # observe the next state, reward etc
                        newState, reward, done, info = env.step(action)

                        cum_reward += reward * gamma_cum
                        gamma_cum = gamma * gamma_cum

                        state = newState

                        if done:
                            break

                        t = t + 1

                    eval_rewards.append(cum_reward)

                # save variables
                if save_variables:
                    save_path = saver.save(sess, checkpoint_file)
                    logger.log(f"Steps: {n}", logger.INFO)
                    logger.log(f"Model saved in path: {save_path}", logger.INFO)

        # Close the env
        env.close()

        # save variables
        if save_variables:
            save_path = saver.save(sess, checkpoint_file)
            logger.log(f"Model saved in path: {save_path}")

        # exit workers
        for i in range(num_processes):
            inputQs[i].put(("exit", None, None))
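The training loop above talks to its SamplingWorker processes with two message types: ("sample", policy_weights, omega) to request rollouts and ("exit", None, None) to shut a worker down, with (command, stats) tuples coming back on the shared output queue. The sketch below illustrates that protocol from the worker side; the real SamplingWorker class is not shown here, and rollout_fn stands in for its actual rollout logic.

# Illustrative worker loop matching the queue protocol used above (not the original class).
from multiprocessing import Process

class SamplingWorkerSketch(Process):
    def __init__(self, inputQ, outputQ, rollout_fn):
        super().__init__()
        self.inputQ = inputQ        # receives ("sample", policy_weights, omega) or ("exit", None, None)
        self.outputQ = outputQ      # emits (command, stats) tuples consumed by the training loop
        self.rollout_fn = rollout_fn

    def run(self):
        while True:
            cmd, policy_weights, omega = self.inputQ.get()
            if cmd == "exit":
                break
            if cmd == "sample":
                stats = self.rollout_fn(policy_weights, omega)
                self.outputQ.put((cmd, stats))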
Example #24
0
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    env = Create2DockerEnv(30,
                           port='/dev/ttyUSB0',
                           ir_window=20,
                           ir_history=1,
                           obs_history=1,
                           dt=0.045,
                           random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_docker,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down the plotting process
    time.sleep(2)
    pp.join()

    env.close()
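create_callback is not shown in this example. Below is a plausible sketch, assuming it returns a baselines-style callback (called with the learner's locals() and globals()) that copies episodic returns into the shared dictionary under its write_lock flag; the actual implementation may differ.

# Illustrative callback factory compatible with the shared_returns dict above (not the original code).
def create_callback(shared_returns):
    def kindred_callback(locals_, globals_):
        seg = locals_.get("seg")
        if seg is None or shared_returns["write_lock"]:
            return
        shared_returns["write_lock"] = True
        returns = shared_returns["episodic_returns"]
        lengths = shared_returns["episodic_lengths"]
        returns.extend(seg.get("ep_rets", []))
        lengths.extend(seg.get("ep_lens", []))
        shared_returns["episodic_returns"] = returns   # reassign so the Manager dict registers the update
        shared_returns["episodic_lengths"] = lengths
        shared_returns["write_lock"] = False
    return kindred_callback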
Example #25
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          restore=False):
    rank = MPI.COMM_WORLD.Get_rank()
    max_action = np.array([0.2, 0.2, 0.2, 0.2, 0.2, 0.2])
    # min_action = np.array([-0.2, -0.2, -0.2, -0.2, -0.2, -0.2])

    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    model_directory = '/home/zhimin/PycharmProjects/RL_UA/Peg_in_Hole/1-baselines/baselines/ddpg/simulation_data'

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.state_dim,
                 env.action_dim,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 restore=restore)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    saver = tf.train.Saver()
    """Set up logging stuff only for a single worker"""
    # if rank == 0:
    #     saver = tf.train.Saver()
    # else:
    #     saver = None
    # eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        """Prepare everything"""
        if restore:
            saver = tf.train.import_meta_graph(os.path.join(model_directory, 'model.meta'))
            agent.restore_model(model_directory, saver, sess)
        else:
            agent.initialize(sess)
            sess.graph.finalize()
        """Agent Reset"""
        agent.reset()
        # episode_step = 0
        # episodes = 0
        # t = 0
        """Force calibration"""
        # if env.robot_control.CalibFCforce() is False:
        #     exit()

        delay_rate = np.power(10, 1 / nb_epochs)
        epoch_start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_adaptive_distances = []
        epoch_episodes_discount_reward = []
        epoch_episodes_average_reward = []

        epoch_actions = []
        epoch_qs = []
        Force_moments = []
        epoch_episodes = 0
        Long_term_reward = -0.10
        for epoch in range(nb_epochs):
            """Show the result for cycle 20 times and Save the model"""
            epoch_actor_losses = []
            epoch_critic_losses = []
            """Delay the learning rate"""
            epoch_actor_lr = actor_lr / delay_rate
            epoch_critic_lr = critic_lr / delay_rate

            for cycle in range(nb_epoch_cycles):
                """environment reset """
                agent.reset()
                obs = env.reset()
                episode_reward = 0.
                episode_discount_reward = 0.
                q_value = 0.
                done = False
                forcement = []
                Last_average_reward = 0.
                Number_episodes = 0.
                for t_rollout in range(nb_rollout_steps):
                    """Predict next action"""
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape[0] == env.action_dim

                    q_value += q
                    """scale for execution in env"""
                    new_obs, r, done, info, expert_action = env.step(
                        action, t_rollout)
                    episode_discount_reward += gamma * r
                    """adapt_action_noise"""
                    agent.feed_back_explore(action, expert_action)

                    logger.info("The maximum force:" +
                                str(max(abs(new_obs[0:3]))) +
                                " The maximum moments:" +
                                str(max(abs(new_obs[3:6]))))
                    episode_reward += r

                    delta = r - Long_term_reward
                    # if memory.nb_entries >= batch_size and param_noise is not None:
                    #     agent.feed_back_explore(delta)
                    Number_episodes = gamma + gamma * Number_episodes
                    Last_average_reward = r + gamma * Last_average_reward
                    """Plot the force and moments"""
                    # if render:
                    #     forcement.append(new_obs[0:6])
                    #     # print(forcement)
                    #     Force_moments.append(new_obs[0:6])
                    #     env.plot_force(forcement, t_rollout+1)

                    if epoch == 0 and cycle == 0:
                        forcement.append(new_obs[0:6])
                        Force_moments.append(new_obs[0:6])
                        # env.plot_force(forcement, t_rollout + 1)

                    if epoch == nb_epochs - 1 and cycle == nb_epoch_cycles - 1:  # record forces on the final epoch and cycle
                        forcement.append(new_obs[0:6])
                        Force_moments.append(new_obs[0:6])
                        # env.plot_force(forcement, t_rollout + 1)

                    epoch_actions.append(action)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs
                    """Episode done and start pull the pegs step by step"""
                    if done:
                        logger.info('Peg-in-hole assembly done!!!')
                        epoch_episode_rewards.append(episode_reward)
                        epoch_episodes_discount_reward.append(
                            Last_average_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(t_rollout)
                        epoch_episodes += 1
                        # pull_done = False
                        # while pull_done is False and info:
                        #     pull_done, pull_safe = env.step_up() #Simulation env
                        #     pull_done, pull_safe = env.pull_up() #True env
                        #
                        # if pull_safe is False:
                        #     logger.info('Pull up the pegs failed for the exceed force!!!')
                        #     exit()
                        break
                    """Episode failed and start pull the pegs step by step"""
                    if info is False:
                        logger.info(
                            'Peg-in-hole assembly failed for the exceed force!!!'
                        )
                        # pull_done = False
                        # while pull_done is False and info:
                        #     pull_done, pull_safe = env.step_up()
                        #     pull_done, pull_safe = env.pull_up()  # True env
                        #
                        # if pull_safe is False:
                        #     logger.info('Peg-in-hole assembly failed for the exceed force!!!')
                        #     exit()

                        break

                Long_term_reward = Last_average_reward / Number_episodes
                epoch_qs.append(q_value)
                env.save_figure('force_moment')
                epoch_episodes_average_reward.append(Long_term_reward)
                agent.feedback_adptive_explore()
                if t_rollout == nb_rollout_steps - 1:
                    logger.info(
                        'Peg-in-hole assembly failed for exceed steps!!!')
                    logger.info('The deepest position: {}'.format(obs[8]))
                """train model for nb_train_steps times"""
                for t_train in range(nb_train_steps):
                    cl, al = agent.train(epoch_actor_lr, epoch_critic_lr)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()
            """Adapt param noise, if necessary"""
            if memory.nb_entries >= batch_size and param_noise is not None:
                distance = agent.adapt_param_noise()
                epoch_adaptive_distances.append(distance)
            """write the result into the summary"""
            agent.log_scalar("actor_loss", mpi_mean(epoch_actor_losses),
                             epoch_episodes)
            agent.log_scalar("critic_loss", mpi_mean(epoch_critic_losses),
                             epoch_episodes)
            agent.log_scalar("episode_score", mpi_mean(epoch_episode_rewards),
                             epoch_episodes)
            agent.log_scalar("episode_steps", mpi_mean(epoch_episode_steps),
                             epoch_episodes)
            agent.log_scalar("episode_average_reward",
                             mpi_mean(epoch_episodes_average_reward),
                             epoch_episodes)
            agent.log_scalar("episode_discount_score",
                             mpi_mean(epoch_episodes_discount_reward),
                             epoch_episodes)
            """Log stats."""
            epoch_train_duration = time.time() - epoch_start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])
            """Rollout statistics. compute the mean of the total nb_epoch_cycles"""
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)
            """Train statistics"""
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)
            """save the model and the result"""
            saver.save(sess, os.path.join(model_directory, 'simulation_model'))
            # re_rewards = pd.DataFrame(epoch_episode_rewards)
            # re_rewards.to_csv("re_rewards.csv", sep=',', header=False, index=False)
            re_forcement = pd.DataFrame(Force_moments)
            re_forcement.to_csv(os.path.join(model_directory, 'simulation_forcement'),
                                sep=',',
                                header=False,
                                index=False)
            # re_steps = pd.DataFrame(epoch_episode_steps)
            # re_steps.to_csv("re_steps.csv", sep=',', header=False, index=False)
            # nf = pd.read_csv("data.csv", sep=',', header=None)

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
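This example logs everything through mpi_mean, mpi_std and mpi_sum helpers that are not part of the snippet. They presumably reduce statistics across MPI workers; below is a minimal sketch under that assumption, not the example's own helpers.

# Illustrative MPI reduction helpers matching the calls above (the example's own helpers are not shown).
import numpy as np
from mpi4py import MPI

def mpi_mean(value):
    local = float(np.mean(value)) if np.size(value) > 0 else 0.0
    total = MPI.COMM_WORLD.allreduce(local, op=MPI.SUM)
    return total / MPI.COMM_WORLD.Get_size()

def mpi_sum(value):
    local = float(np.sum(value)) if np.size(value) > 0 else 0.0
    return MPI.COMM_WORLD.allreduce(local, op=MPI.SUM)

def mpi_std(value):
    mean = mpi_mean(value)
    sq_mean = mpi_mean(np.square(value)) if np.size(value) > 0 else 0.0
    return float(np.sqrt(max(sq_mean - mean ** 2, 0.0)))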
Example #26
0
def train_copos(env_id, compatible_policy, num_timesteps, timesteps_per_batch,
                seed, filepath, visualize, n_policy, retrace, trpo,
                entropy_bonus, epsilon, beta):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure(dir=filepath)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed  # + 10000 * MPI.COMM_WORLD.Get_rank()
    if compatible_policy:

        def policy_fn(name, ob_space, ac_space):
            return CompatibleMlpPolicy(name=name,
                                       ob_space=ob_space,
                                       ac_space=ac_space,
                                       hid_size=32,
                                       num_hid_layers=2)
    else:
        assert (trpo)

        def policy_fn(name, ob_space, ac_space):
            return MlpPolicy(name=name,
                             ob_space=ob_space,
                             ac_space=ac_space,
                             hid_size=32,
                             num_hid_layers=2)

    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(workerseed)

    if beta < 0:
        nr_episodes = num_timesteps // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)

        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    if visualize:
        # Load existing policy and visualize
        copos_mpi.visualize(env,
                            policy_fn,
                            timesteps_per_batch=timesteps_per_batch,
                            epsilon=epsilon,
                            beta=beta,
                            cg_iters=10,
                            cg_damping=0.1,
                            max_timesteps=num_timesteps,
                            gamma=0.99,
                            lam=0.98,
                            entcoeff=entropy_bonus,
                            vf_iters=5,
                            vf_stepsize=1e-3,
                            TRPO=trpo,
                            n_policy=n_policy,
                            policy_type=1,
                            filepath=filepath,
                            session=sess,
                            retrace=retrace)
        env.close()
    else:
        # Train policy and save it
        copos_mpi.learn(env,
                        policy_fn,
                        timesteps_per_batch=timesteps_per_batch,
                        epsilon=epsilon,
                        beta=beta,
                        cg_iters=10,
                        cg_damping=0.1,
                        max_timesteps=num_timesteps,
                        gamma=0.99,
                        lam=0.98,
                        entcoeff=entropy_bonus,
                        vf_iters=5,
                        vf_stepsize=1e-3,
                        TRPO=trpo,
                        n_policy=n_policy,
                        policy_type=1,
                        filepath=filepath,
                        session=sess,
                        retrace=retrace)
        env.close()
        saver = tf.train.Saver()
        saver.save(sess, filepath + "_final")
Example #27
0
def train_copos(env_id, num_timesteps, seed, trial, hist_len, block_high,
                nsteps, method, hid_size, give_state, vf_iters):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed * 10000

    def policy_fn(name, ob_space, ac_space, ob_name):
        return CompatibleMlpPolicy(name=name,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   hid_size=hid_size,
                                   num_hid_layers=2,
                                   ob_name=ob_name)

    set_global_seeds(workerseed)
    env = make_control_env(env_id,
                           seed,
                           hist_len=hist_len,
                           block_high=block_high,
                           version0=False,
                           give_state=give_state)
    env.seed(workerseed)

    timesteps_per_batch = nsteps
    beta = -1
    if beta < 0:
        nr_episodes = num_timesteps // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations
        tmp_pi = policy_fn("tmp_pi",
                           env.observation_space,
                           env.action_space,
                           ob_name="tmp_ob")
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    copos_mpi.learn(env,
                    policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    epsilon=0.01,
                    beta=beta,
                    cg_iters=10,
                    cg_damping=0.1,
                    method=method,
                    max_timesteps=num_timesteps,
                    gamma=0.99,
                    lam=0.98,
                    vf_iters=vf_iters,
                    vf_stepsize=1e-3,
                    trial=trial,
                    crosskl_coeff=0.01,
                    kl_target=0.01,
                    sess=sess)
    env.close()
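Both COPOS examples derive beta automatically from the entropy of the freshly initialized policy, spreading roughly twice that entropy budget over the planned number of updates. A quick illustrative calculation with made-up numbers:

# Illustrative arithmetic for the automatic beta rule used above (all numbers are made up).
num_timesteps = 10000
timesteps_per_batch = 500
nr_episodes = num_timesteps // timesteps_per_batch   # 20 policy updates
entropy = 1.4                                        # entropy of the freshly initialized policy (example value)
beta = 2 * entropy / nr_episodes                     # 0.14
print(beta)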
Example #28
0
from baselines.trpo_mpi import trpo_mpi
from baselines.ppo1.mlp_policy import MlpPolicy  # assumed import: MlpPolicy is used below but missing from the snippet
from baselines import logger  # assumed import: logger is used below but missing from the snippet
import gym
import tensorflow as tf
import argparse
import baselines.common.tf_util as U
from baselines.common import set_global_seeds
from mpi4py import MPI

#parser
parser = argparse.ArgumentParser()
parser.add_argument('--environment', dest='environment', type=str, default='MountainCarContinuous-v0')
parser.add_argument('--num_timesteps', dest='num_timesteps', type=int, default=10000)
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
args = parser.parse_args()

sess = U.single_threaded_session()
sess.__enter__()

rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
    logger.set_level(logger.DISABLED)
workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)

# create the environment
env = gym.make(str(args.environment))
# initial_observation = env.reset()

def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
        hid_size=32, num_hid_layers=2)
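The script above stops after defining policy_fn; judging from its imports, it presumably goes on to seed the environment and call trpo_mpi.learn, much like the other TRPO examples in this collection. A hedged sketch of that continuation, with illustrative hyperparameter values rather than the original ones:

# Hedged sketch of how the script above presumably continues (not the original code).
env.seed(workerseed)
trpo_mpi.learn(env, policy_fn,
    timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
    max_timesteps=args.num_timesteps, gamma=0.99, lam=0.98,
    vf_iters=5, vf_stepsize=1e-3)
env.close()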
Example #29
0
def learn(*,
          network,
          env,
          total_timesteps,
          num_cpu,
          allow_run_as_root,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('before mpi_fork: rank', rank, 'num_cpu',
                MPI.COMM_WORLD.Get_size())

    if num_cpu > 1:
        if allow_run_as_root:
            whoami = mpi_fork_run_as_root(num_cpu)
        else:
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            logger.info('parent exiting with code 0...')
            sys.exit(0)

        U.single_threaded_session().__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    num_cpu = MPI.COMM_WORLD.Get_size()
    logger.info('after mpi_fork: rank', rank, 'num_cpu', num_cpu)

    override_params = override_params or {}

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    params['rollout_batch_size'] = env.num_envs
    params['num_cpu'] = num_cpu
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env,
                                   policy,
                                   dims,
                                   logger,
                                   monitor=True,
                                   **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
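    # Clarifying note: n_epochs is the number of whole epochs that fit into total_timesteps;
    # each epoch runs n_cycles cycles of rollout_batch_size episodes of T steps each, so the
    # "actual total timesteps" logged below is floored to a whole number of epochs.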
    logger.info("actual total timesteps : {}".format(
        n_epochs * n_cycles * rollout_worker.T *
        rollout_worker.rollout_batch_size))

    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 save_interval=save_interval,
                 demo_file=demo_file)
def train(env,
          nb_epochs,
          nb_episodes,
          nb_epoch_cycles,
          episode_length,
          nb_train_steps,
          eval_freq,
          save_freq,
          nb_eval_episodes,
          actor,
          critic,
          memory,
          gamma,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          action_noise,
          param_noise,
          popart,
          clip_norm,
          batch_size,
          reward_scale,
          action_repeat,
          full,
          exclude_centering_frame,
          visualize,
          fail_reward,
          num_processes,
          num_processes_to_wait,
          num_testing_processes,
          learning_session,
          min_buffer_length,
          integrator_accuracy=5e-5,
          max_env_traj=100,
          tau=0.01):
    """
    Parameters
    ----------
    nb_epochs : the number of epochs to train.

    nb_episodes : the number of episodes for each epoch.

    episode_length : the maximum number of steps for each episode.

    gamma : discount factor.

    tau : soft update coefficient.

    clip_norm : clip on the norm of the gradient.
    """

    assert action_repeat > 0
    assert nb_episodes >= num_processes

    # Get params from learning session
    checkpoint_dir = learning_session.checkpoint_dir
    log_dir = learning_session.log_dir
    training_step = learning_session.last_training_step

    # Initialize DDPG agent (target network and replay buffer)
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=None,
                 critic_l2_reg=critic_l2_reg,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 training_step=training_step)

    # The actor's output layer is a tanh, so its actions lie in [-1, 1];
    # max_action is needed to scale them back to the environment's action range.
    max_action = env.action_space.high
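    # Illustrative example: if env.action_space.high == [2.0] and the actor emits
    # a = 0.5, the action executed in the environment is 2.0 * 0.5 = 1.0.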

    # Build Workers
    events = [Event() for _ in range(num_processes)]
    inputQs = [Queue() for _ in range(num_processes)]
    outputQ = Queue()
    # Split work among workers
    nb_episodes_per_worker = nb_episodes // num_processes

    workers = [
        SamplingWorker(i, actor, critic, episode_length,
                       nb_episodes_per_worker, action_repeat, max_action,
                       gamma, tau, normalize_returns, batch_size,
                       normalize_observations, param_noise, critic_l2_reg,
                       popart, clip_norm, reward_scale, events[i], inputQs[i],
                       outputQ, full, exclude_centering_frame,
                       integrator_accuracy, max_env_traj, visualize,
                       fail_reward) for i in range(num_processes)
    ]

    # Run the Workers
    for w in workers:
        w.start()

    # Create Round Robin tester
    tester = RoundRobinTester(
        num_testing_processes, actor, critic, episode_length, nb_eval_episodes,
        action_repeat, max_action, gamma, tau, normalize_returns, batch_size,
        normalize_observations, critic_l2_reg, popart, clip_norm, reward_scale,
        full, exclude_centering_frame, integrator_accuracy, max_env_traj,
        visualize, fail_reward)

    # Start training loop
    with U.single_threaded_session() as sess:
        agent.initialize(sess)

        writer = tf.summary.FileWriter(log_dir)
        writer.add_graph(sess.graph)

        # Initialize writer and statistics
        stats = EvaluationStatistics(tf_session=sess, tf_writer=writer)

        # setup saver
        saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2)

        get_parameters = U.GetFlat(actor.trainable_vars)
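        # U.GetFlat returns the actor's trainable weights concatenated into a single flat
        # vector, which is convenient to ship through the inter-process Queues; the sampling
        # workers restore it with U.SetFromFlat (see the run() method further below).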

        global_step = 0
        obs = env.reset()
        agent.reset()

        # Processes waiting for a new sampling task
        waiting_indices = list(range(num_processes))
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # If we have sampling workers waiting, dispatch a sampling job
                if waiting_indices:
                    actor_ws = get_parameters()
                    # Run parallel sampling
                    for i in waiting_indices:
                        inputQs[i].put(('sample', actor_ws))
                        events[i].set()  # Notify worker: sample baby, sample!
                    waiting_indices.clear()

                # Collect results when ready
                for i in range(num_processes_to_wait):
                    process_index, transitions = outputQ.get()
                    waiting_indices.append(process_index)
                    print('Collecting transition samples from Worker {}/{}'.
                          format(i + 1, num_processes_to_wait))
                    for t in transitions:
                        agent.store_transition(*t)

                # try to collect other samples if available
                for i in range(num_processes):
                    try:
                        process_index, transitions = outputQ.get_nowait()
                        if process_index not in waiting_indices:
                            waiting_indices.append(process_index)
                        print('Collecting transition samples from Worker {}'.
                              format(process_index))
                        for t in transitions:
                            agent.store_transition(*t)
                    except queue.Empty:
                        # No sampling ready, keep on training.
                        pass

                # Training phase
                if agent.memory.nb_entries > min_buffer_length:
                    for _ in range(nb_train_steps):
                        critic_loss, actor_loss = agent.train()
                        agent.update_target_net()

                        # Plot statistics
                        stats.add_critic_loss(critic_loss, global_step)
                        stats.add_actor_loss(actor_loss, global_step)
                        global_step += 1

                    # Evaluation phase
                    if cycle % eval_freq == 0:
                        print("Cycle number: ",
                              cycle + epoch * nb_epoch_cycles)
                        print("Sending testing job...")
                        actor_ws = get_parameters()

                        # Send a testing job
                        tester.test(actor_ws, global_step)

                        # Print stats (if any)
                        tester.log_stats(stats, logger)

                    if cycle % save_freq == 0:
                        # Save weights
                        save_path = saver.save(sess,
                                               checkpoint_dir,
                                               global_step=global_step)
                        print("Model saved in path: %s" % save_path)
                        # Dump learning session
                        learning_session.dump(agent.training_step)
                        print("Learning session dumped to: %s" %
                              str(learning_session.session_path))
                else:
                    print("Not enough entry in memory buffer")

        # Stop workers
        for i in range(num_processes):
            inputQs[i].put(('exit', None))
            events[i].set()  # Notify worker: exit!
        tester.close()  # Stop testing workers
        env.close()
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor,
          critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50,
          gamma_reward_shaping=0.1, start_reward_shaping=10000):
    logger.info(sys._getframe().f_code.co_name)


    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()
    max_action = env.action_space.high
    logger.info("scale actions by {} before executing in env".format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info("Using agent with the following configuration:")
    logger.info(str(agent.__dict__.items()))

    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        episode_sample = []
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                for t_rollout in range(nb_eval_steps):
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    episode_sample.append((obs, action, r, new_obs, done))
                    if t <= start_reward_shaping:
                        agent.store_transition(obs, action, r, new_obs, done)


                    if done:
                        if t > start_reward_shaping:
                            logger.info("start reward shaping")
                            reward = r
                            agent.store_transition(obs, action, reward, new_obs, done)
                            # episode_sample.append()
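                            # Clarifying note: the loop below walks the episode backwards and
                            # builds a shaped reward
                            #     reward_k = rew_k + round(gamma_reward_shaping * reward_{k+1}, 5)
                            # i.e. a discounted return (with the shaping discount) propagated
                            # back from the terminal step before each transition is stored.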
                            for i in range(len(episode_sample) - 1):
                                obs_tmp, action_tmp, rew_tmp, new_obs_tmp, done_tmp = \
                                    episode_sample[len(episode_sample) - i - 1]
                                reward = round(reward * gamma_reward_shaping, 5)
                                reward = reward + rew_tmp
                                agent.store_transition(obs_tmp, action_tmp, reward, new_obs_tmp, done)

                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()
                    obs = new_obs
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.


            mpi_size = MPI.COMM_WORLD.Get_size()

            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats["rollout/return"] = np.mean(epoch_episode_rewards)
            combined_stats["rollout/return_history"] = np.mean(episode_rewards_history)
            combined_stats["rollout/episode_steps"] = np.mean(epoch_episode_steps)
            combined_stats["rollout/actions_mean"] = np.mean(epoch_actions)
            combined_stats["rollout/Q_mean"] = np.mean(epoch_qs)
            combined_stats["train/loss_actor"] = np.mean(epoch_actor_losses)
            combined_stats["train/loss_critic"] = np.mean(epoch_critic_losses)
            combined_stats["train/param_noise_distance"] = np.mean(epoch_adaptive_distances)
            combined_stats["total/duration"] = duration
            combined_stats["total/steps_per_second"] = float(t) / float(duration)
            combined_stats["total/episodes"] = episodes
            combined_stats["rollout/episodes"] = epoch_episodes
            combined_stats["rollour/actions_std"] = np.std(epoch_actions)

            if eval_env is not None:
                combined_stats["eval/return"] = eval_episode_rewards
                combined_stats["eval/return_history"] = np.mean(eval_episode_rewards_history)
                combined_stats["eval/Q"] = eval_qs
                combined_stats["eval/episodes"] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError("expected scalar, got %s" % x)
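            # Average the stats across MPI workers: allreduce (default op: SUM) sums the
            # vector element-wise, and dividing by mpi_size gives the per-worker mean.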
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x)
                                                                    for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            combined_stats["total/epochs"] = epoch + 1
            combined_stats["total/steps"] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            logger.dump_tabular()
            logger.info("")
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, "get_state"):
                    with open(os.path.join(logdir, "env_state.pkl"), "wb") as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, "get_state"):
                    with open(os.path.join(logdir, "eval_env_state.pkl"), "wb") as f:
                        pickle.dump(eval_env.get_state(), f)
    def run(self):
        """Override Process.run()"""
        # Create environment
        env = create_environment(
            action_repeat=self.action_repeat,
            full=self.full,
            exclude_centering_frame=self.exclude_centering_frame,
            visualize=self.visualize,
            fail_reward=self.fail_reward,
            integrator_accuracy=self.integrator_accuracy)
        nb_actions = env.action_space.shape[-1]

        # keep track of the number of trajectories completed
        num_traj = 0

        env.seed(os.getpid())
        set_global_seeds(os.getpid())

        # Create OU Noise
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=0.2,
                                                    theta=0.1)
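        # Ornstein-Uhlenbeck noise yields temporally correlated exploration noise, which
        # tends to suit physical control tasks; sigma sets the noise scale and theta how
        # strongly the process is pulled back towards its mean mu.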

        # Allocate ReplayBuffer
        memory = Memory(limit=int(1e6),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)

        # Create DDPG agent
        agent = DDPG(self.actor,
                     self.critic,
                     memory,
                     env.observation_space.shape,
                     env.action_space.shape,
                     gamma=self.gamma,
                     tau=self.tau,
                     normalize_returns=self.normalize_returns,
                     normalize_observations=self.normalize_observations,
                     batch_size=self.batch_size,
                     action_noise=action_noise,
                     param_noise=self.param_noise,
                     critic_l2_reg=self.critic_l2_reg,
                     enable_popart=self.popart,
                     clip_norm=self.clip_norm,
                     reward_scale=self.reward_scale)

        # Build the sampling logic fn
        sampling_fn = make_sampling_fn(agent, env, self.episode_length,
                                       self.action_repeat, self.max_action,
                                       self.nb_episodes,
                                       self.action_noise_prob)

        # Start TF session
        with U.single_threaded_session() as sess:
            agent.initialize(sess)
            set_parameters = U.SetFromFlat(self.actor.trainable_vars)
            # Start sampling-worker loop.
            while True:
                # self.event.wait()  # Wait for a new message
                # self.event.clear()  # Upon message receipt, mark as read
                message, actor_ws = self.inputQ.get()  # Pop message
                if message == 'sample':
                    # Set weights
                    set_parameters(actor_ws)
                    # Do sampling
                    transitions = sampling_fn()
                    self.outputQ.put((self.process_index, transitions))

                    # update number of trajectories
                    num_traj += self.nb_episodes

                    # restore environment if needed
                    if num_traj >= self.max_env_traj:
                        env.restore()
                        num_traj = 0

                elif message == 'exit':
                    print('[Worker {}] Exiting...'.format(os.getpid()))
                    env.close()
                    break
Example #33
def main(port, id, baud):
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          dxl_dev_path=port,
                          idn=id,
                          baudrate=baud,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state
                          )

    # The policy outputs are sampled from a Gaussian, but the torque commands expected by the
    # environment lie in [-max_torque_mag, max_torque_mag]; the NormalizedEnv wrapper rescales
    # actions accordingly. By default it does not normalize observations or rewards.
    env = NormalizedEnv(env)
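    # Illustrative example (assuming a linear rescaling from [-1, 1]): with max_torque_mag=100,
    # a policy action of +1.0 is executed as a torque command of +100, and -0.5 as -50.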

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    # Plotting process
    pp = Process(target=plot_dxl_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=50000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback,
          )

    # Safely terminate plotter process
    plot_running.value = 0  # shut down the plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
Example #34
def launch(env_name,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
Example #35
def main():
    with U.single_threaded_session() as sess:
        batch_size = 64
        current_noise_type = 'adaptive-param_0.2'
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        param_noise_adaption_interval = 2
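        # AdaptiveParamNoiseSpec perturbs the actor's parameters (rather than its actions)
        # and adapts the perturbation stddev so that the induced change in actions stays close
        # to desired_action_stddev; agent.adapt_param_noise() below performs that adaptation.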
        env = gym.make("Pendulum-v0")

        nb_actions = env.action_space.shape[-1]
        layer_norm = True

        # Configure components.
        memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = int(1000000 * np.random.rand())
        logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.seed(seed)

        max_action = env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(max_action))
        agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                     batch_size=batch_size, param_noise=param_noise)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        obs = env.reset()
        for t in itertools.count():
            episode_rewards = []
            done = False
            while not done:
                env.render()

                # Take action and update exploration to the newest value
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                new_obs, rew, done, _ = env.step(max_action * action)

                # Book-keeping.
                agent.store_transition(obs, action, rew, new_obs, done)
                obs = new_obs

                episode_rewards.append(rew)
                if done:
                    agent.reset()
                    obs = env.reset()

            nb_train_steps = 100
            epoch_adaptive_distances = []
            epoch_critic_losses = []
            epoch_actor_losses = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            if t % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1))
                logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses)))
                logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses)))
                logger.record_tabular('train/param_noise_distance', round(np.mean(epoch_adaptive_distances)))
                logger.dump_tabular()
Example #36
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        mean_episode_rewards = []
        # mean_100_episode_rewards = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        mean_episode_rewards.append(
                            np.mean(episode_rewards_history))

                        if episodes == 500:
                            print(
                                "epoch_episode_rewards*************************************"
                            )
                            print(epoch_episode_rewards)
                            print(
                                "mean_episode_rewards*************************************"
                            )
                            print(mean_episode_rewards)
                            return

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #37
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          save_path=None,
          restore_path=None,
          hindsight_mode=None):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()

        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                transitions = []
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    transitions.append((obs, action, r, new_obs, done))
                    #agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # store regular transitions into replay memory
                for (obs, action, r, new_obs, done) in transitions:
                    agent.store_transition(obs, action, r, new_obs, done)
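
                # Hindsight relabeling (clarifying note): with hindsight_mode 'final' or 'future',
                # replay_final / replay_future are assumed to re-label transitions with the episode's
                # final (or a later) achieved state as the goal and to recompute rewards accordingly,
                # in the spirit of Hindsight Experience Replay, before storing them as extra transitions.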

                if hindsight_mode in ['final', 'future']:
                    for (obs, action, r, new_obs,
                         done) in replay_final(transitions, env.env):
                        agent.store_transition(obs, action, r, new_obs, done)

                if hindsight_mode in ['future']:
                    for (obs, action, r, new_obs,
                         done) in replay_future(transitions, env.env):
                        agent.store_transition(obs, action, r, new_obs, done)

                # store hindsight transitions.
                '''for i in range(3):
                    # sample a random point in the trajectory
                    idx = np.random.randint(0, len(transitions))
                    obs, action, r, new_obs, done = transitions[idx]
                    # create a goal from that point
                    goal = env.env.obs_to_goal(new_obs)
                    for (obs, action, r, new_obs, done) in replay_with_goal(transitions[:idx+1], goal, env.env):
                        agent.store_transition(obs, action, r, new_obs, done)
                obs, action, r, new_obs, done = transitions[-1]

                # store a "final" transition.
                goal = env.env.obs_to_goal(new_obs)
                for (obs, action, r, new_obs, done) in replay_with_goal(transitions, goal, env.env):
                    agent.store_transition(obs, action, r, new_obs, done)'''

                # Train.

                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['reward'] = mpi_mean(epoch_episode_rewards)
            # combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
            combined_stats['episode_steps'] = mpi_mean(epoch_episode_steps)
            combined_stats['episodes'] = mpi_sum(epoch_episodes)
            # combined_stats['actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['actions_std'] = mpi_std(epoch_actions)
            combined_stats['Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['policy_loss'] = mpi_mean(epoch_actor_losses)
            combined_stats['value_loss'] = mpi_mean(epoch_critic_losses)
            combined_stats['param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/reward'] = mpi_mean(eval_episode_rewards)
                # combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q_mean'] = mpi_mean(eval_qs)
                # combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))

            # Total statistics.
            # combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            # combined_stats['total/episodes'] = mpi_mean(episodes)
            # combined_stats['total/epochs'] = epoch + 1
            # combined_stats['total/steps'] = t
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #38
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
Example #39
from baselines.common import set_global_seeds, tf_util as U

from baselines.agent.utility.general_utils import get_ee_points, get_position
from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.common.mpi_fork import mpi_fork
from baselines.trpo_mpi import trpo_mpi

import baselines.common.tf_util as U

env = gym.make('GazeboModularScara3DOF-v3')
initial_observation = env.reset()
print("Initial observation: ", initial_observation)
env.render()
seed = 0

sess = U.single_threaded_session()
sess.__enter__()

rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
    logger.set_level(logger.DISABLED)
workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)


def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name,
                     ob_space=env.observation_space,
                     ac_space=env.action_space,
                     hid_size=32,
                     num_hid_layers=2)
Example #40
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create UR5 Reacher2D environment
    env = ReacherEnv(setup="UR5_default",
                     host=None,
                     dof=2,
                     control_type="velocity",
                     target_type="position",
                     reset_type="zero",
                     reward_type="precision",
                     derivative_type="none",
                     deriv_action_max=5,
                     first_deriv_max=2,
                     accel_max=1.4,
                     speed_max=0.3,
                     speedj_a=1.4,
                     episode_length_time=4.0,
                     episode_length_step=None,
                     actuation_sync_period=1,
                     dt=0.04,
                     run_mode="multiprocess",
                     rllab_box=False,
                     movej_t=2.0,
                     delay=0.0,
                     random_state=rand_state)
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=150000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
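
The `create_callback` helper used above is not shown in this snippet. Purely as an illustration of the shared-dictionary pattern (the real implementation in the source project may differ), a baselines-style callback that copies per-iteration episode statistics into `shared_returns` could look like the sketch below; the name `create_callback_sketch` and the `seg` lookup are assumptions, not part of the original code.

# Illustrative sketch only: assumes the learner calls callback(locals_, globals_)
# once per iteration and exposes the current rollout segment in its locals as
# `seg` with 'ep_rets' and 'ep_lens' entries.
def create_callback_sketch(shared_returns):
    def callback(locals_, globals_):
        seg = locals_.get('seg')
        if seg is None:
            return  # first iteration: nothing collected yet
        if not shared_returns['write_lock']:
            shared_returns['write_lock'] = True   # crude flag-based lock for the plotter
            shared_returns['episodic_returns'] += list(seg['ep_rets'])
            shared_returns['episodic_lengths'] += list(seg['ep_lens'])
            shared_returns['write_lock'] = False
    return callback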
Example #41
0
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           fb,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir, fb=fb)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps',
            'sg_regenerate', 'goals_noise_eps', 'goals_random_eps',
            'n_subgoals', 'n_steps_per_subgoal'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
Example #42
0
def train(env_id, num_timesteps, timesteps_per_batch, seed, num_cpu, resume,
          agentName, logdir, hid_size, num_hid_layers, noisy_nets, clip_param,
          entcoeff, optim_epochs, optim_batchsize, optim_stepsize,
          optim_schedule, desired_kl, gamma, lam, portnum, num_parallel):
    from baselines.ppo1 import mlp_policy, pposgd_parallel
    print("num cpu = " + str(num_cpu))
    if (num_cpu > 1) and (num_parallel > 1):
        print(
            "num_cpu > 1 and num_parallel > 1 can't be used together at the moment!"
        )
        exit(0)

    whoami = mpi_fork(num_cpu)
    if whoami == "parent": return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    if rank != 0: logger.set_level(logger.DISABLED)
    utils.portnum = portnum + rank
    workerseed = seed + 10000 * rank

    if utils.server_list != "":
        servers = utils.server_list.split(",")
        num_thread = utils.num_thread_list.split(",")
        tmp = 0
        a = 0
        snum = -1
        num_total = 0
        for t in num_thread:
            num_total += int(t)

        for t in num_thread:
            if rank < tmp + int(t):
                snum = a
                break
            tmp += int(t)
            a += 1
        if num_total != num_cpu:
            print("Sum of num_thread_list must be equal to num_cpu")
            quit()
        print("Connect to tcp://" + servers[snum] + ":" + str(utils.portnum))
        utils.server_ip = servers[snum]

    set_global_seeds(workerseed)
    if num_parallel > 1:
        env = CustomParallelEnv(num_parallel)
    else:
        env = gym.make(env_id)
        env.seed(seed)

    if logger.get_dir():
        if num_parallel <= 1:
            env = bench.Monitor(env, osp.join(logger.get_dir(),
                                              "monitor.json"))

    def policy_fn(name, ob_space, ac_space, noisy_nets=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=hid_size,
                                    num_hid_layers=num_hid_layers,
                                    noisy_nets=noisy_nets)

    gym.logger.setLevel(logging.WARN)
    pposgd_parallel.learn(env,
                          policy_fn,
                          max_timesteps=num_timesteps,
                          timesteps_per_batch=timesteps_per_batch,
                          clip_param=clip_param,
                          entcoeff=entcoeff,
                          optim_epochs=optim_epochs,
                          optim_stepsize=optim_stepsize,
                          optim_batchsize=optim_batchsize,
                          schedule=optim_schedule,
                          desired_kl=desired_kl,
                          gamma=gamma,
                          lam=lam,
                          resume=resume,
                          noisy_nets=noisy_nets,
                          agentName=agentName,
                          logdir=logdir,
                          num_parallel=num_parallel,
                          num_cpu=num_cpu)
    if num_parallel <= 1:
        env.close()
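
The rank-to-server assignment in the middle of the function above is easy to misread. The standalone helper below re-expresses the same mapping on hypothetical inputs; it is a sketch for illustration, not code from the snippet.

# Illustrative helper: maps an MPI rank onto a server from comma-separated
# lists such as server_list="10.0.0.1,10.0.0.2" and num_thread_list="8,8".
def server_for_rank(rank, server_list, num_thread_list):
    servers = server_list.split(",")
    threads = [int(t) for t in num_thread_list.split(",")]
    offset = 0
    for ip, t in zip(servers, threads):
        if rank < offset + t:
            return ip
        offset += t
    raise ValueError("rank %d exceeds the %d configured threads" % (rank, sum(threads)))

# Example: ranks 0-7 go to the first server, ranks 8-15 to the second.
assert server_for_rank(9, "10.0.0.1,10.0.0.2", "8,8") == "10.0.0.2"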
Example #43
0
def main(env_name,
         seed,
         run_num,
         data_saving_path,
         batch_size_per_process,
         num_iterations,
         autoencoder_base="./novelty_data/local/autoencoders/"):
    num_processes = MPI.COMM_WORLD.Get_size()
    num_timesteps_per_process = batch_size_per_process
    num_iterations_enforce = num_iterations

    import baselines.common.tf_util as U

    comm = MPI.COMM_WORLD
    mpi_rank = comm.Get_rank()

    tf.reset_default_graph()

    with U.single_threaded_session() as sess:
        autoencoder_list = []
        for i in range(run_num):
            autoencoder_model = load_model(autoencoder_base + env_name +
                                           '_autoencoder_seed_' + str(seed) +
                                           '_run_' + str(i) + '.h5')
            autoencoder_list.append(autoencoder_model)

        U.ALREADY_INITIALIZED.update(set(tf.global_variables()))

        logger.reset()
        # logger.configure(
        #     '../data/ppo_' + enforce_env_name + '_autoencoder_' + str(len(autoencoder_list)) + '_seed=' + str(
        #         seed) + '/' + str(st))

        logger.configure(data_saving_path)

        model = train(sess,
                      env_name,
                      num_timesteps=num_iterations_enforce * num_processes *
                      num_timesteps_per_process,
                      timesteps_per_actor=num_timesteps_per_process,
                      autoencoders=autoencoder_list,
                      seed=seed)

        if mpi_rank == 0:
            env = gym.make(env_name)
            env.env.novel_autoencoders = autoencoder_list
            if hasattr(env.env, 'disableViewer'):
                env.env.disableViewer = False
            env = wrappers.Monitor(env,
                                   logger.get_dir() + '/../results',
                                   force=True)

            obs = env.reset()

            step = 0
            while True:
                env.render()
                actions = model._act(False, obs)
                obs, _, done, _ = env.step(actions[0][0])
                if done:
                    print("Visualization is Done")
                    break
                step += 1
Example #44
0
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' + 
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
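
A launcher like `launch()` is normally driven from the command line. The sketch below wires it to `argparse` with defaults chosen for illustration; the original script may use a different CLI front end and different defaults.

# Hypothetical CLI entry point for launch(); argument names mirror its signature,
# default values are illustrative assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='FetchReach-v1')
    parser.add_argument('--logdir', type=str, default=None)
    parser.add_argument('--n_epochs', type=int, default=50)
    parser.add_argument('--num_cpu', type=int, default=1)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--replay_strategy', type=str, default='future')
    parser.add_argument('--policy_save_interval', type=int, default=5)
    parser.add_argument('--clip_return', type=int, default=1)
    args = parser.parse_args()
    launch(**vars(args))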