Example #1
def main(_):
  _config = default_config()
  if _config.use_gpu and len(utility.available_gpus(_config.sess_config)) < 1:
    raise ValueError('There are no available GPUs.')
  
  if not FLAGS.logdir:
    env = gym.make('SpaceInvaders-v0')
    # env = gym.make('Breakout-v0')
    # env = gym.make('Pong-v0')
    logdir = os.path.abspath('/tmp/{}-{}'.format(
      datetime.datetime.now().strftime('%Y%m%dT%H%M%S'), env.spec.id))
  else:
    logdir = str(FLAGS.logdir)
    env = gym.make(logdir.split('-', 1)[1])
  env = gym.wrappers.Monitor(env, logdir + "/gym", force=True)
  env = wrap_deepmind(env, dim=_config.frame_dim)
  
  atari_actions = np.arange(env.action_space.n, dtype=np.int32)
  
  # Initialize networks.
  with tf.variable_scope('q_network'):
    q_network = ValueFunction(_config,
                              env.observation_space,
                              env.action_space,
                              summaries_dir=logdir)
  with tf.variable_scope('target'):
    target = ValueFunction(_config, env.observation_space, env.action_space, q_network)
  # Epsilon
  eps = np.linspace(_config.epsilon_start, _config.epsilon_end, _config.epsilon_decay_steps)
  
  sess = utility.make_session(_config.sess_config)
  
  import time
  time.sleep(10.)
Example #2
def main(_):
    init = tf.global_variables_initializer()
    env = gym.make('SpaceInvaders-v0')
    config = utility.AttrDict(default_config())
    with tf.Session() as sess:
        init.run()
        buffer = initialize_memory(sess, env, config)
        print(len(buffer))
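
For reference, here is a minimal sketch of what a replay-memory warm-up such as initialize_memory typically does: take random actions and store transitions until the buffer holds a minimum number of them. The helper name, the buffer sizes, and the Transition tuple below are illustrative assumptions, not the actual dqn implementation.

import collections

Transition = collections.namedtuple(
    'Transition', 'observ reward terminal next_observ action')


def fill_replay_buffer(env, min_size=10000, max_size=100000):
    """Warm up a replay buffer by acting randomly in the environment."""
    buffer = collections.deque(maxlen=max_size)
    observ = env.reset()
    while len(buffer) < min_size:
        action = env.action_space.sample()
        next_observ, reward, terminal, _ = env.step(action)
        buffer.append(
            Transition(observ, reward, terminal, next_observ, action))
        # Start a fresh episode when the current one ends.
        observ = env.reset() if terminal else next_observ
    return buffer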
Example #3
    def test_initialize(self):
        env = gym.make('SpaceInvaders-v0')

        _config = tools.AttrDict(default_config())

        _structure = functools.partial(_config.network, _config,
                                       env.observation_space, env.action_space)
        _network = tf.make_template('network', _structure)
        _target = tf.make_template('target', _structure)
        network = _network()
        target = _target(network.picked_action)
        init = tf.global_variables_initializer()
        with self.test_session() as sess:
            sess.run(init)
Example #4
def main(_):
    env = gym.make('SpaceInvaders-v0')
    # env = gym.make('Pong-v0')
    env = wrap_deepmind(env)

    experiment_dir = os.path.abspath("./experiments2/{}".format(env.spec.id))
    atari_actions = np.arange(env.action_space.n, dtype=np.int32)
    # _config = tools.AttrDict(default_config())
    _config = default_config()

    # Initialize networks.
    with tf.variable_scope('q_network'):
        q_network = ValueFunction(_config,
                                  env.observation_space,
                                  env.action_space,
                                  summaries_dir=experiment_dir)
Example #5
def main(_):
    env = gym.make('SpaceInvaders-v0')
    env = wrap_deepmind(env)

    atari_actions = np.arange(env.action_space.n, dtype=np.int32)

    _config = tools.AttrDict(default_config())

    # Initialize networks.
    with tf.variable_scope('q_network'):
        q_network = ValueFunction(_config, env.observation_space,
                                  env.action_space)
    with tf.variable_scope('target'):
        target = ValueFunction(_config, env.observation_space,
                               env.action_space, q_network)
    # Epsilon schedule (linear decay from epsilon_start to epsilon_end).
    eps = np.linspace(_config.epsilon_start, _config.epsilon_end,
                      _config.epsilon_decay_steps)

    sess = make_session()
    initialize_variables(sess)
    saver, checkpoint_path = make_saver(sess)

    # Initialize memory
    memory = initialize_memory(sess, env, _config)
    # Initialize policy
    policy = eps_greedy_policy(q_network, env.action_space.n)

    total_step = sess.run(tf.train.get_global_step())
    print('total_step', total_step)

    for episode in range(_config.num_episodes):
        observ = env.reset()
        observ = atari_preprocess(sess, observ)
        observ = np.stack([observ] * 4, axis=2)
        for t in itertools.count():
            action_prob = policy(
                sess, observ, eps[min(total_step,
                                      _config.epsilon_decay_steps - 1)])
            action = np.random.choice(atari_actions, size=1, p=action_prob)[0]
            next_observ, reward, terminal, _ = env.step(action)
            # next_observ = atari_preprocess(sess, next_observ)
            next_observ = np.concatenate(
                [observ[..., 1:], next_observ[..., None]], axis=2)
            memory.append(
                transition(observ, reward, terminal, next_observ, action))

            batch_transition = memory.sample(_config.batch_size)
            best_actions = q_network.best_action(sess,
                                                 batch_transition.next_observ)
            target_values = target.estimate(sess, batch_transition.reward,
                                            batch_transition.terminal,
                                            batch_transition.next_observ,
                                            best_actions)

            loss = q_network.update_step(sess, batch_transition.observ,
                                         batch_transition.action,
                                         target_values)
            print('\r({}/{}) loss: {}'.format(total_step,
                                              _config.max_total_step_size,
                                              loss),
                  end='',
                  flush=True)

            if total_step % _config.update_target_estimator_every == 0:
                print('\nUpdate Target Network...')
                target.assign(sess)

            if terminal:
                break

            # Advance the frame stack before the next step.
            observ = next_observ
            total_step += 1
        saver.save(sess,
                   checkpoint_path,
                   global_step=tf.train.get_global_step())
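
The policy object used above returns a probability distribution over the available actions, which is then sampled with np.random.choice. Below is a minimal epsilon-greedy sketch of that behaviour; the factory name and the q_values_fn callable are assumptions for illustration, not the actual eps_greedy_policy implementation.

import numpy as np


def make_eps_greedy_policy(q_values_fn, num_actions):
    """Build a policy mixing a uniform and a greedy action distribution."""
    def policy(sess, observ, epsilon):
        # Spread epsilon uniformly over all actions...
        probs = np.ones(num_actions, dtype=np.float32) * epsilon / num_actions
        # ...and put the remaining (1 - epsilon) mass on the greedy action.
        best_action = np.argmax(q_values_fn(sess, observ[None]))
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy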
Example #6
"""Test Schedule"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np

from dqn.configs import default_config
from dqn.utility import PiecewiseSchedule

_config = default_config()

num_iterations = float(_config.max_total_step_size) / 4.0
lr_multiplier = 1.0
_lr_schedule = PiecewiseSchedule(
    [
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
    outside_value=5e-5 * lr_multiplier)

ts = np.arange(0., _config.max_total_step_size)
values = []
for t in ts:
    values.append(_lr_schedule.value(t))

values = np.asarray(values)
print(values[0:10])
plt.plot(ts, values)
plt.show()
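
For context, a piecewise schedule of this kind interpolates between (time, value) endpoints and falls back to outside_value beyond them, so the schedule above holds the learning rate at 1e-4 for the first tenth of training, decays it linearly to 5e-5 by the halfway point, and returns 5e-5 afterwards. The sketch below assumes linear interpolation, as in common DQN baselines; it is not the actual dqn.utility.PiecewiseSchedule implementation.

class LinearPiecewiseSchedule(object):
    """Linearly interpolate between sorted (time, value) endpoints."""

    def __init__(self, endpoints, outside_value=None):
        self._endpoints = sorted(endpoints)
        self._outside_value = outside_value

    def value(self, t):
        for (left_t, left_v), (right_t, right_v) in zip(
                self._endpoints[:-1], self._endpoints[1:]):
            if left_t <= t < right_t:
                alpha = float(t - left_t) / (right_t - left_t)
                return left_v + alpha * (right_v - left_v)
        # Outside the listed endpoints, return the configured fallback.
        return self._outside_value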
Example #7
def main(_):
    _config = default_config()
    if _config.use_gpu and len(utility.available_gpus(
            _config.sess_config)) < 1:
        raise ValueError('There are no available GPUs.')
    env, logdir = utility.make_atari_environment(FLAGS.logdir,
                                                 _config.frame_dim,
                                                 _config.use_monitor)
    atari_actions = np.arange(env.action_space.n, dtype=np.int32)

    # Initialize networks.
    with tf.variable_scope('q_network'):
        q_network = ValueFunction(_config,
                                  env.observation_space,
                                  env.action_space,
                                  summaries_dir=logdir)
    with tf.variable_scope('target'):
        target = ValueFunction(_config, env.observation_space,
                               env.action_space, q_network)
    # Epsilon
    eps = np.linspace(_config.epsilon_start, _config.epsilon_end,
                      _config.epsilon_decay_steps)
    policy = EpsGreedy(q_network, env.action_space.n)

    saver = utility.define_saver(exclude=('.*_temporary.*', ))
    sess = utility.make_session(_config.sess_config)
    if _config.use_monitor:
        evaluator = Evaluate(sess, env, q_network)
    utility.initialize_variables(sess, saver, logdir)

    # Initialize memory
    memory = initialize_memory(sess, env, _config, q_network)
    # Restore the global step (resumes from a checkpoint if one was loaded).
    total_step = sess.run(tf.train.get_global_step())
    last_timestep = 0
    print('total_step', total_step)

    for episode in range(_config.num_episodes):
        filename = os.path.join(logdir, 'model.ckpt')
        saver.save(sess, filename, global_step=tf.train.get_global_step())
        last_rewards = 0.
        observ = env.reset()
        for t in itertools.count():
            action_prob = policy(
                sess, observ, eps[min(total_step,
                                      _config.epsilon_decay_steps - 1)])
            action = np.random.choice(atari_actions, size=1, p=action_prob)[0]
            next_observ, reward, terminal, _ = env.step(action)
            memory.append(
                transition(observ, reward, terminal, next_observ, action))

            batch_transition = memory.sample(_config.batch_size)
            best_actions = q_network.best_action(sess,
                                                 batch_transition.next_observ)
            target_values = target.estimate(sess, batch_transition.reward,
                                            batch_transition.terminal,
                                            batch_transition.next_observ,
                                            best_actions)

            loss = q_network.update_step(sess, batch_transition.observ,
                                         batch_transition.action,
                                         target_values, total_step)
            print('\r({}/{}) loss: {}'.format(total_step,
                                              _config.max_total_step_size,
                                              loss),
                  end='',
                  flush=True)

            if total_step % _config.update_target_estimator_every == 0:
                print('\nUpdate Target Network...')
                target.assign(sess)

            if terminal:
                break

            observ = next_observ
            total_step += 1
        if _config.use_monitor:
            last_rewards, length_ = evaluator.evaluate()
        print('Episode ({}/{}), last return: {}, last timesteps: {:05}'.format(
            episode, _config.num_episodes, last_rewards,
            total_step - last_timestep))
        last_timestep = total_step
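
The update above follows the Double DQN pattern: the online q_network selects best_actions for the next observations, while the target network evaluates them inside target.estimate. A minimal sketch of that target computation is shown below; the function name and the gamma=0.99 discount are assumptions for illustration, not the actual ValueFunction.estimate implementation.

import numpy as np


def double_dqn_targets(target_q_values, rewards, terminals, best_actions,
                       gamma=0.99):
    """Compute Double DQN regression targets for a batch of transitions."""
    # target_q_values: [batch, num_actions] Q-values of the target network for
    # the next observations; best_actions were chosen by the online network.
    batch_index = np.arange(len(best_actions))
    bootstrap = target_q_values[batch_index, best_actions]
    # Terminal transitions contribute only their immediate reward.
    terminals = np.asarray(terminals, dtype=np.float32)
    return rewards + gamma * (1.0 - terminals) * bootstrap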