def main(_):
  _config = default_config()
  if _config.use_gpu and len(utility.available_gpus(_config.sess_config)) < 1:
    raise ValueError('There are no available GPUs.')
  if not FLAGS.logdir:
    env = gym.make('SpaceInvaders-v0')
    # env = gym.make('Breakout-v0')
    # env = gym.make('Pong-v0')
    logdir = os.path.abspath('/tmp/{}-{}'.format(
        datetime.datetime.now().strftime('%Y%m%dT%H%M%S'), env.spec.id))
  else:
    # Recover the environment id from the logdir name; the timestamp
    # contains no hyphen, so everything after the first '-' is the id.
    logdir = str(FLAGS.logdir)
    env = gym.make(logdir.split('-', 1)[1])
  env = gym.wrappers.Monitor(env, logdir + '/gym', force=True)
  env = wrap_deepmind(env, dim=_config.frame_dim)
  atari_actions = np.arange(env.action_space.n, dtype=np.int32)

  # Initialize the online Q-network and the target network.
  with tf.variable_scope('q_network'):
    q_network = ValueFunction(_config, env.observation_space,
                              env.action_space, summaries_dir=logdir)
  with tf.variable_scope('target'):
    target = ValueFunction(_config, env.observation_space, env.action_space,
                           q_network)

  # Linearly decaying epsilon schedule for the epsilon-greedy policy.
  eps = np.linspace(_config.epsilon_start, _config.epsilon_end,
                    _config.epsilon_decay_steps)
  sess = utility.make_session(_config.sess_config)
  import time
  time.sleep(10.)
def main(_):
  init = tf.global_variables_initializer()
  env = gym.make('SpaceInvaders-v0')
  config = utility.AttrDict(default_config())
  with tf.Session() as sess:
    init.run()
    buffer = initialize_memory(sess, env, config)
    print(len(buffer))
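# A minimal sketch of what `initialize_memory` presumably does: pre-fill a
# replay buffer with transitions gathered under a uniform-random policy
# before learning starts. The `replay_memory_size` and
# `replay_memory_init_size` field names and the `transition` layout are
# assumptions, not confirmed by this repo.
import collections

transition = collections.namedtuple(
    'transition', 'observ, reward, terminal, next_observ, action')

def initialize_memory_sketch(env, config):
  memory = collections.deque(maxlen=config.replay_memory_size)
  observ = env.reset()
  for _ in range(config.replay_memory_init_size):
    action = env.action_space.sample()  # Uniform-random exploration.
    next_observ, reward, terminal, _ = env.step(action)
    memory.append(transition(observ, reward, terminal, next_observ, action))
    observ = env.reset() if terminal else next_observ
  return memory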
def test_initialize(self):
  env = gym.make('SpaceInvaders-v0')
  _config = tools.AttrDict(default_config())
  _structure = functools.partial(_config.network, _config,
                                 env.observation_space, env.action_space)
  _network = tf.make_template('network', _structure)
  _target = tf.make_template('target', _structure)
  network = _network()
  target = _target(network.picked_action)
  init = tf.global_variables_initializer()
  with self.test_session() as sess:
    sess.run(init)
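# Why `tf.make_template` above: a template creates its variables on the first
# call and transparently reuses them on every later call, so the 'network'
# and 'target' towers each own exactly one variable set. A self-contained
# TF1 sketch; the toy `layer` function below is illustrative, not from this
# repo:
import tensorflow as tf

def layer(x):
  w = tf.get_variable('w', [], initializer=tf.ones_initializer())
  return x * w

template = tf.make_template('shared', layer)
a = template(tf.constant(2.0))  # First call creates variable 'shared/w'.
b = template(tf.constant(3.0))  # Second call reuses 'shared/w'.
assert len(tf.trainable_variables()) == 1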
def main(_):
  env = gym.make('SpaceInvaders-v0')
  # env = gym.make('Pong-v0')
  env = wrap_deepmind(env)
  experiment_dir = os.path.abspath('./experiments2/{}'.format(env.spec.id))
  atari_actions = np.arange(env.action_space.n, dtype=np.int32)
  # _config = tools.AttrDict(default_config())
  _config = default_config()

  # Initialize networks.
  with tf.variable_scope('q_network'):
    q_network = ValueFunction(_config, env.observation_space,
                              env.action_space, summaries_dir=experiment_dir)
def main(_):
  env = gym.make('SpaceInvaders-v0')
  env = wrap_deepmind(env)
  atari_actions = np.arange(env.action_space.n, dtype=np.int32)
  _config = tools.AttrDict(default_config())

  # Initialize networks.
  with tf.variable_scope('q_network'):
    q_network = ValueFunction(_config, env.observation_space,
                              env.action_space)
  with tf.variable_scope('target'):
    target = ValueFunction(_config, env.observation_space, env.action_space,
                           q_network)

  # Linearly decaying epsilon schedule for the epsilon-greedy policy.
  eps = np.linspace(_config.epsilon_start, _config.epsilon_end,
                    _config.epsilon_decay_steps)
  sess = make_session()
  initialize_variables(sess)
  saver, checkpoint_path = make_saver(sess)
  # Initialize replay memory.
  memory = initialize_memory(sess, env, _config)
  # Initialize policy.
  policy = eps_greedy_policy(q_network, env.action_space.n)
  total_step = sess.run(tf.train.get_global_step())
  print('total_step', total_step)

  for episode in range(_config.num_episodes):
    observ = env.reset()
    observ = atari_preprocess(sess, observ)
    # Stack the first frame four times to build the initial state.
    observ = np.stack([observ] * 4, axis=2)
    for t in itertools.count():
      action_prob = policy(
          sess, observ, eps[min(total_step, _config.epsilon_decay_steps - 1)])
      action = np.random.choice(atari_actions, size=1, p=action_prob)[0]
      next_observ, reward, terminal, _ = env.step(action)
      # Preprocess the raw frame so it matches the frames in the stack.
      next_observ = atari_preprocess(sess, next_observ)
      # Shift the frame stack by one and append the newest frame.
      next_observ = np.concatenate(
          [observ[..., 1:], next_observ[..., None]], axis=2)
      memory.append(
          transition(observ, reward, terminal, next_observ, action))

      batch_transition = memory.sample(_config.batch_size)
      # Double DQN: the online network picks the actions, the target
      # network evaluates them.
      best_actions = q_network.best_action(sess, batch_transition.next_observ)
      target_values = target.estimate(sess, batch_transition.reward,
                                      batch_transition.terminal,
                                      batch_transition.next_observ,
                                      best_actions)
      loss = q_network.update_step(sess, batch_transition.observ,
                                   batch_transition.action, target_values)
      print('\r({}/{}) loss: {}'.format(
          total_step, _config.max_total_step_size, loss),
          end='', flush=True)

      if total_step % _config.update_target_estimator_every == 0:
        print('\nUpdate Target Network...')
        target.assign(sess)
      if terminal:
        break
      observ = next_observ
      total_step += 1
    saver.save(sess, checkpoint_path,
               global_step=tf.train.get_global_step())
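# The loop above calls `policy(sess, observ, epsilon)` and feeds the result
# to `np.random.choice` as a probability vector. A plausible sketch of
# `eps_greedy_policy` under that assumption (the `q_network.predict` call is
# an assumed API, not confirmed by this repo): spread epsilon uniformly over
# all actions, then put the remaining mass on the greedy action.
import numpy as np

def eps_greedy_policy_sketch(q_network, num_actions):
  def policy_fn(sess, observ, epsilon):
    probs = np.ones(num_actions, dtype=np.float32) * epsilon / num_actions
    q_values = q_network.predict(sess, observ[None])[0]
    probs[np.argmax(q_values)] += 1.0 - epsilon
    return probs
  return policy_fn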
"""Test Schedule""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import matplotlib.pyplot as plt import numpy as np from dqn.configs import default_config from dqn.utility import PiecewiseSchedule _config = default_config() num_iterations = float(_config.max_total_step_size) / 4.0 lr_multiplier = 1.0 _lr_schedule = PiecewiseSchedule([ (0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier), (num_iterations / 2, 5e-5 * lr_multiplier), ], outside_value=5e-5 * lr_multiplier) ts = np.arange(0., _config.max_total_step_size) values = [] for t in ts: values.append(_lr_schedule.value(t)) values = np.asarray(values) print(values[0:10]) plt.plot(ts, values)
def main(_):
  _config = default_config()
  if _config.use_gpu and len(utility.available_gpus(_config.sess_config)) < 1:
    raise ValueError('There are no available GPUs.')
  env, logdir = utility.make_atari_environment(FLAGS.logdir,
                                               _config.frame_dim,
                                               _config.use_monitor)
  atari_actions = np.arange(env.action_space.n, dtype=np.int32)

  # Initialize the online Q-network and the target network.
  with tf.variable_scope('q_network'):
    q_network = ValueFunction(_config, env.observation_space,
                              env.action_space, summaries_dir=logdir)
  with tf.variable_scope('target'):
    target = ValueFunction(_config, env.observation_space, env.action_space,
                           q_network)

  # Linearly decaying epsilon schedule for the epsilon-greedy policy.
  eps = np.linspace(_config.epsilon_start, _config.epsilon_end,
                    _config.epsilon_decay_steps)
  policy = EpsGreedy(q_network, env.action_space.n)
  saver = utility.define_saver(exclude=('.*_temporary.*',))
  sess = utility.make_session(_config.sess_config)
  if _config.use_monitor:
    evaluator = Evaluate(sess, env, q_network)
  utility.initialize_variables(sess, saver, logdir)

  # Initialize replay memory.
  memory = initialize_memory(sess, env, _config, q_network)
  total_step = sess.run(tf.train.get_global_step())
  last_timestep = 0
  print('total_step', total_step)

  for episode in range(_config.num_episodes):
    filename = os.path.join(logdir, 'model.ckpt')
    saver.save(sess, filename, global_step=tf.train.get_global_step())
    last_rewards = 0.
    observ = env.reset()
    for t in itertools.count():
      action_prob = policy(
          sess, observ, eps[min(total_step, _config.epsilon_decay_steps - 1)])
      action = np.random.choice(atari_actions, size=1, p=action_prob)[0]
      next_observ, reward, terminal, _ = env.step(action)
      memory.append(
          transition(observ, reward, terminal, next_observ, action))

      batch_transition = memory.sample(_config.batch_size)
      # Double DQN: the online network picks the actions, the target
      # network evaluates them.
      best_actions = q_network.best_action(sess, batch_transition.next_observ)
      target_values = target.estimate(sess, batch_transition.reward,
                                      batch_transition.terminal,
                                      batch_transition.next_observ,
                                      best_actions)
      loss = q_network.update_step(sess, batch_transition.observ,
                                   batch_transition.action, target_values,
                                   total_step)
      print('\r({}/{}) loss: {}'.format(
          total_step, _config.max_total_step_size, loss),
          end='', flush=True)

      if total_step % _config.update_target_estimator_every == 0:
        print('\nUpdate Target Network...')
        target.assign(sess)
      if terminal:
        break
      observ = next_observ
      total_step += 1

    if _config.use_monitor:
      last_rewards, length_ = evaluator.evaluate()
    print('Episode ({}/{}), last return: {}, last timesteps: {:05}'.format(
        episode, _config.num_episodes, last_rewards,
        total_step - last_timestep))
    last_timestep = total_step
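# Both training loops above compute a Double DQN target: `q_network` (the
# online network) selects `best_actions`, and `target` evaluates them. In
# numpy terms, `target.estimate` presumably computes the following; the
# discount `gamma` is assumed to come from the config.
import numpy as np

def double_dqn_target(reward, terminal, next_q_target, best_actions, gamma):
  # next_q_target: [batch, num_actions] Q-values from the target network.
  batch_index = np.arange(len(best_actions))
  picked = next_q_target[batch_index, best_actions]
  # Terminal transitions bootstrap nothing: their target is just the reward.
  return reward + gamma * (1.0 - terminal.astype(np.float32)) * picked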