def main():
    """Benchmark add/sample throughput of ReplayBuffer vs. GymReplayBuffer.

    Fills each buffer with identical random transitions, times repeated
    add and sample passes, and prints the per-phase wall-clock durations.
    """
    state_size = 17
    action_size = 4
    buffer_size = 1024
    batch_size = 32
    num_steps = 4096
    num_samples = 1024
    num_repeat = 10
    gym_memory = GymReplayBuffer(buffer_size)
    memory = ReplayBuffer(state_size, action_size, buffer_size, batch_size, 0)

    # Make some convenient aliases.
    n = num_steps
    ns = state_size
    na = action_size

    # Generate random experiences ...
    states = np.zeros((n, ns), dtype=np.float32)
    actions = np.random.randint(0, na, n)
    rewards = np.random.uniform(0, 1, n)
    next_states = np.zeros((n, ns), dtype=np.float32)
    # BUG FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the documented replacement.
    dones = np.random.randint(2, size=n, dtype=bool)

    ts = []
    ts.append(time.time())

    print('Memory')
    for _ in range(num_repeat):
        for s0, a, r, s1, d in zip(states, actions, rewards, next_states, dones):
            memory.add(s0, a, r, s1, d)
    ts.append(time.time())
    for _ in range(num_repeat):
        for _ in range(num_samples):
            sample = memory.sample()
    ts.append(time.time())

    print('Gym-Memory')
    for _ in range(num_repeat):
        for s0, a, r, s1, d in zip(states, actions, rewards, next_states, dones):
            gym_memory.add(s0, a, r, s1, d)
    ts.append(time.time())
    for _ in range(num_repeat):
        for _ in range(num_samples):
            sample = gym_memory.sample(batch_size)
    ts.append(time.time())

    # Durations of the four phases above, in order: add/sample per buffer.
    print('Result')
    print(np.diff(ts))
def load_demo_buffer(env_name, max_items):
    """Load up to `max_items` human-demonstration transitions for `env_name`
    from the local MineRL dataset (./res) into a ReplayBuffer and return it."""
    env_wrapper = MineCraftWrapper(None)
    demo_buffer = ReplayBuffer(arglist.replay_buffer_len)
    data = minerl.data.make(environment=env_name, data_dir="./res")
    print("#############################################")
    print("Loading demonstrations")
    print("#############################################")
    items = 0
    for current_state, action, reward, next_state, done in data.batch_iter(
            batch_size=1, num_epochs=1, seq_len=500):
        # NOTE(review): assumes len(reward) iterates the sequence steps —
        # confirm against batch_iter's (batch, seq) layout.
        for step in range(len(reward)):
            minerl_obs = {
                'pov': current_state['pov'][0][step],
                'compassAngle': current_state['compassAngle'][0][step]
            }
            obs = env_wrapper.minerl_obs_to_obs(minerl_obs)
            minerl_new_obs = {
                'pov': next_state['pov'][0][step],
                'compassAngle': next_state['compassAngle'][0][step]
            }
            new_obs = env_wrapper.minerl_obs_to_obs(minerl_new_obs)
            minerl_action = {
                'attack': action['attack'][0][step],
                'back': action['back'][0][step],
                'camera': action['camera'][0][step],
                'forward': action['forward'][0][step],
                'jump': action['jump'][0][step],
                'left': action['left'][0][step],
                'right': action['right'][0][step]
            }
            # BUG FIX: this result was previously assigned back to `action`,
            # clobbering the batched action dict and corrupting every
            # subsequent step of the same sequence.
            converted_action = env_wrapper.minerl_action_to_action(minerl_action)
            demo_buffer.add(obs, converted_action, reward[0][step], new_obs,
                            float(done[0][step]))
            items += 1
            if items >= max_items:
                break
        # BUG FIX: the inner `break` only ended the sequence loop; stop
        # iterating the dataset as well once the quota is reached.
        if items >= max_items:
            break
    print("#############################################")
    print("Finished loading demonstrations")
    print("#############################################")
    return demo_buffer
env.state = (np.float32(exp_demo[t][0]),np.float32(exp_demo[t][1]),np.float32(exp_demo[t][2]),np.float32(exp_demo[t][3])) # Take action and update exploration to the newest value action = int(exp_demo[t][5]) # new_obs, rew, done, _ = env.step(action) if exp_demo[t][4] == "True": done = True elif exp_demo[t][4] == "False": done = False # Store transition in the replay buffer. rew = 1.0 obs = np.float32(exp_demo[t][:4]) if(t < 999): new_obs = np.float32(exp_demo[t+1][:4]) else: new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs else: # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs env.render() else: if t < 1000: # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer.
# Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 if is_solved: # Show off the result env.render() else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
def main():
    """Simple deepq training loop over a gym environment with periodic
    target-network updates and tabular logging every 10 episodes."""
    with U.make_session(8):
        env = gym.make("Pendulum-v0")
        # NOTE(review): `num_actions` is passed the action-space *object*, not
        # an integer count (sibling call sites pass `env.action_space.n`), and
        # Pendulum-v0 has a continuous Box action space. This looks broken —
        # confirm the intended environment and argument before relying on it.
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            env.render()
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)
            # Mean over the last 100 completed episodes (excludes the in-progress one).
            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
            if is_solved:
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()
            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
# Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # print("DONE: ", done," REWARD: ", rew) # Store transition in the replay buffer. ment_obs = [] ment_obs_tp1 = [] ment_act = 0 replay_buffer.add(obs, action, rew, new_obs, float(done), ment_obs, ment_obs_tp1, ment_act) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 if is_solved: # Capture N samples and save them into a csv file env.render() if len(exp_demo) < N: temp_list = list(obs) # temp_list.append(done) # temp_list.append(action)
def train_dqn(opts,
              seed=None,
              lr=1e-3,
              total_timesteps=500000,
              buffer_size=50000,
              exploration_fraction=0.1,
              exploration_final_eps=0.02,
              train_freq=1,
              batch_size=32,
              checkpoint_freq=500000,
              learning_starts=1000,
              gamma=1.000,
              target_network_update_freq=3000,
              load_path=None):
    """Train a DQN agent on LunarLander-v2, logging rewards/TD summaries to
    TensorBoard and checkpointing weights under ``opts.model_dir``.

    NOTE(review): the previous docstring ("Runs the main recorder by binding
    certain discrete actions to keys.") appeared to be copy-pasted from an
    unrelated recorder utility and did not describe this function.
    """
    # Refuse to clobber an existing model directory without interactive confirmation.
    if os.path.exists(opts.model_dir):
        print('Path already exists. Remove? y for yes')
        input_char = getch.getch()
        if not input_char == 'y':
            print('Exiting')
            return
        shutil.rmtree(opts.model_dir)
    os.makedirs(opts.model_dir)
    os.makedirs(os.path.join(opts.model_dir, 'logs'))
    os.makedirs(os.path.join(opts.model_dir, 'weights'))
    #env = gym.make('MountainCar-v0')
    env = gym.make('LunarLander-v2')
    env._max_episode_steps = 1200
    sess = get_session()
    set_global_seeds(seed)
    train_writer = tf.summary.FileWriter(os.path.join(opts.model_dir, 'logs'), sess.graph)
    q_func = build_q_func('mlp')

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    replay_buffer = ReplayBuffer(buffer_size)

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    episode_rewards = [0.0]
    obs = env.reset()
    for t in range(total_timesteps):
        # Take action and update exploration to the newest value
        env.render()
        update_eps = exploration.value(t)
        action = act(np.array(obs)[None], update_eps=update_eps)[0]
        new_obs, rew, done, _ = env.step(action)
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        episode_rewards[-1] += rew
        if done:
            print("Exploration value: {}".format(exploration.value(t)))
            print("Last 25 episode rewards: {}".format(episode_rewards[-25:]))
            # Log the finished episode's total reward to TensorBoard.
            reward_summary = tf.Summary(value=[
                tf.Summary.Value(tag='reward', simple_value=episode_rewards[-1])
            ])
            train_writer.add_summary(reward_summary, t)
            obs = env.reset()
            episode_rewards.append(0.0)
        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            # NOTE(review): `train` is expected to return (td_errors, summary) —
            # this relies on a customized build_train; confirm its signature.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            weights, batch_idxes = np.ones_like(rewards), None
            td_errors, summary = train(obses_t, actions, rewards, obses_tp1, dones, weights)
            train_writer.add_summary(summary, t)
        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()
        if t > learning_starts and t % checkpoint_freq == 0:
            save_variables(os.path.join(opts.model_dir, 'weights', '{}.model'.format(t)))
    save_variables(os.path.join(opts.model_dir, 'weights', 'last.model'))
def learn(env,
          q_func,
          policy_fn,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train an AMPI-style agent: a Q network regressed on TD targets plus a
    policy network trained by classification against the greedy target-Q action.

    Returns the policy's `act` callable. Checkpoints the best-performing model
    into a temp dir and restores it before returning.

    Bug fixes vs. the previous revision:
      * `target_pi_func_vars` was collected under the misspelled scope
        "taget_pi_func", so the collection was empty and the target policy
        network was never updated;
      * the Q gradient-clipping branch iterated over the undefined name
        `gradients_qq`, raising NameError whenever grad_norm_clipping was set.
    """
    # Create all the functions necessary to train the model
    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    scope = "ampi"
    reuse = None
    grad_norm_clipping = None
    num_actions = env.action_space.n
    optimizer_q = tf.train.AdamOptimizer(learning_rate=lr)
    optimizer_pi = tf.train.AdamOptimizer(learning_rate=lr)

    act = build_act(make_obs_ph, q_func, num_actions=env.action_space.n, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # Policy network and its target copy.
        # NOTE: this deliberately rebinds `act` to the policy's sampler; the
        # training loop below calls it as act(stochastic, obs_batch).
        ob_space = env.observation_space
        ac_space = env.action_space
        pi, act = policy_fn(obs_t_input.get(), ob_space, ac_space, scope="pi_func")  # train pi
        pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope=tf.get_variable_scope().name + "/pi_func")
        pi_tp1, act_tp1 = policy_fn(obs_tp1_input.get(), ob_space, ac_space, scope="target_pi_func")  # target pi
        # BUG FIX: scope was misspelled "taget_pi_func" here.
        target_pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope=tf.get_variable_scope().name + "/target_pi_func")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name + "/q_func")

        # target q network evalution
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=tf.get_variable_scope().name + "/target_q_func")

        # Q_{train}(a,s)
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # y_j
        # NOTE(review): the greedy action is taken from `pi` (computed on obs_t),
        # not `pi_tp1` — looks intentional in AMPI-style code but confirm.
        act_best = tf.argmax(pi, axis=1)  # argmax \pi(s_{j+1})
        q_tp1_sampled = tf.reduce_sum(q_tp1 * tf.one_hot(act_best, num_actions), 1)  # Q_{target}(s_{j+1}, argmax(\pi(s_{j+1}))
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_sampled
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # Regression loss
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # argmax_a Q_{target}(s_j, a)
        z_j = tf.argmax(q_tp1, axis=1)  # max Q(s',a')

        # classification loss
        cl_error = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=z_j)

        # Q optimization
        if grad_norm_clipping is not None:
            gradients_q = optimizer_q.compute_gradients(weighted_error, var_list=q_func_vars)
            # BUG FIX: previously iterated over the undefined `gradients_qq`.
            for i, (grad, var) in enumerate(gradients_q):
                if grad is not None:
                    gradients_q[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_q = optimizer_q.apply_gradients(gradients_q)
        else:
            optimize_q = optimizer_q.minimize(weighted_error, var_list=q_func_vars)

        # pi optimization
        if grad_norm_clipping is not None:
            gradients_pi = optimizer_pi.compute_gradients(cl_error, var_list=pi_func_vars)
            for i, (grad, var) in enumerate(gradients_pi):
                if grad is not None:
                    gradients_pi[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_pi = optimizer_pi.apply_gradients(gradients_pi)
        else:
            optimize_pi = optimizer_pi.minimize(cl_error, var_list=pi_func_vars)

        # update_target Q
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # update_target pi
        update_target_pi = []
        for var, var_target in zip(sorted(pi_func_vars, key=lambda v: v.name),
                                   sorted(target_pi_func_vars, key=lambda v: v.name)):
            update_target_pi.append(var_target.assign(var))
        update_target_pi = tf.group(*update_target_pi)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, cl_error],
            updates=[optimize_q, optimize_pi]
        )
        update_target = U.function([], [], updates=[update_target_expr, update_target_pi])
        q_values = U.function([obs_t_input], q_t)
        debug = {'q_values': q_values}

    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = env.action_space.sample()  # not used, just so we have the datatype
            stochastic = True
            ac1, vpred1 = act(stochastic, np.array(obs)[None])
            action = ac1[0]
            #action, _ = pi.act(stochastic, obs)
            #action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # Log train and res
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)
    return act
def learn(
        env,
        max_timesteps=50000000,
        # Human level control hyperparameters
        batch_size=32,
        buffer_size=1000000,
        agent_history_length=4,
        target_network_update_freq=10000,
        discount_factor=0.99,
        # "action_repeat=4" handled by gym environment(equivalent to frame skip)
        train_freq=4,  # agent "update frequency" in human level control paper
        initial_exploration_rate=1,
        final_exploration_rate=0.1,
        final_exploration_frame=1000000,
        replay_start_size=50000,
        print_freq=10,
        checkpoint_freq=100,
        episode_render_freq=None,
        log_dir='./tensorboard',
        start_from_checkpoint=False):
    """Keras-based DQN training loop with frame-stacked observations and the
    'Human-level control' hyperparameters; checkpoints the model on mean-reward
    improvement and logs episode rewards to TensorBoard."""
    writer = tf.summary.FileWriter(log_dir + '/' + env.spec.id)

    # Linear decay as used in the deepmind paper
    epsilon = lambda t: max(
        initial_exploration_rate - (t / final_exploration_frame),
        final_exploration_rate)
    # Only preprocess image observations (rank-3 obs assumed to be frames).
    preprocess = _preprocess if len(
        env.observation_space.shape) == 3 else lambda x: x
    replay_buffer = ReplayBuffer(buffer_size)
    num_actions = env.action_space.n

    # Here, we'll use a simple feed forward nn for representing
    # Q(s) -> [r_1, r_2, ..., r_n] where r_k is the reward for taking action
    # `k` in state `s`
    if start_from_checkpoint:
        model = load_model('tmp_model', custom_objects={'huber_loss': huber_loss})
    else:
        model = q_nn(env.observation_space, num_actions, agent_history_length)
    target_model = clone_model(model)

    # Keep some state about the current episode
    num_episodes = 0
    episode_total_reward = 0
    episode_timesteps = 0
    episode_rewards = [0.0]
    last_checkpoint_mean_reward = -inf
    mean_100ep_reward = -inf

    # Start off with a fresh environment
    ob = preprocess(env.reset())
    obs = [ob for i in range(agent_history_length)]

    # Play breakout for max_timesteps
    for t in range(max_timesteps):
        # With probability epsilon, take a random action
        if (random.uniform(0, 1) < epsilon(t)):
            action = env.action_space.sample()
        else:
            observations = np.array([obs])
            actions = np.reshape(np.ones(num_actions), [1, -1])
            q_values = model.predict_on_batch([observations, actions])
            action = np.argmax(q_values, axis=1)[0]

        # Collect observations and store them for replay
        new_ob, reward, is_done, info = env.step(action)
        # NOTE(review): overrides the env's done flag — treats any lost life as
        # episode end, hard-coded for a 5-life ALE game; confirm intent.
        is_done = info['ale.lives'] != 5
        # Slide the frame-stack window by one.
        new_obs = list(obs)
        new_obs.pop(0)
        new_obs.append(preprocess(new_ob))
        replay_buffer.add(obs, action, reward, new_obs, is_done)
        obs = new_obs

        # Update logging info
        episode_total_reward += reward
        episode_timesteps += 1

        if t > replay_start_size and t % train_freq == 0:
            fit_batch(model, target_model, num_actions, discount_factor,
                      replay_buffer.sample(batch_size), writer, t // train_freq)

        if t > replay_start_size and t % target_network_update_freq == 0:
            # Must checkpoint model and clear sess to avoid OOM https://github.com/keras-team/keras/issues/5345
            model.save('tmp_model')
            K.clear_session()
            target_model = load_model(
                'tmp_model', custom_objects={'huber_loss': huber_loss})
            model = load_model('tmp_model', custom_objects={'huber_loss': huber_loss})
            print('Setting model to target model')

        if is_done:
            ob = preprocess(env.reset())
            # NOTE(review): after a reset `obs` becomes an ndarray while it was
            # a plain list before the first episode end — verify the buffer and
            # model accept both.
            obs = np.array([ob for i in range(agent_history_length)])
            episode_timesteps = 0
            num_episodes += 1
            episode_rewards.append(episode_total_reward)
            episode_total_reward = 0
            # Keep at most the last 100 completed episode rewards.
            if len(episode_rewards) > 100:
                episode_rewards.pop(0)
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)

        if is_done and num_episodes % print_freq == 0:
            print("timesteps", t)
            print("episodes run", num_episodes)
            print("last episode reward", episode_rewards[-1])
            print("mean_100ep_reward", mean_100ep_reward)
            print("% time spent exploring", int(100 * epsilon(t)))

        if t % checkpoint_freq == 0 and mean_100ep_reward > last_checkpoint_mean_reward:
            print("Saving model due to mean reward increase: ",
                  last_checkpoint_mean_reward, " -> ", mean_100ep_reward)
            model.save('models/' + env.spec.id + '_deepq.h5py')
            last_checkpoint_mean_reward = mean_100ep_reward

        if episode_render_freq is not None and num_episodes % episode_render_freq == 0:
            env.render()
def learn(self):
    """Run the DQN training loop for this agent's configured environment.

    Publishes the rolling mean episode reward into self._reward_buffer
    (guarded by self._reward_buffer_mutex) so another thread can display it,
    and renders the env once the mean reward passes the render threshold.
    """
    with U.make_session(8):
        # Create the environment
        env = gym.make(self._args.env)
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(
                env.observation_space, name=name),
            q_func=self.model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(
                learning_rate=self._args.learning_rate),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(self._args.replay_buffer_size)
        # Create the schedule for exploration starting from 1 till min_exploration_rate.
        exploration = LinearSchedule(
            schedule_timesteps=self._args.exploration_duration,
            initial_p=1.0,
            final_p=self._args.min_exploration_rate)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)
            # Mean over the last 100 completed episodes (excludes the in-progress one).
            mean_episode_reward = np.mean(episode_rewards[-101:-1])
            # Show learned agent:
            if mean_episode_reward >= self._render_reward_threshold:
                env.render()
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            # Update target network periodically.
            if t % 1000 == 0:
                update_target()
            if done and len(episode_rewards) % 10 == 0:
                # Publish stats under the mutex so a monitoring thread can read them.
                self._reward_buffer_mutex.acquire()
                self._reward_buffer.append(mean_episode_reward)
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(mean_episode_reward, 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
                self._reward_buffer_changed = True
                self._reward_buffer_mutex.release()
def main():
    """DQN training against the Halite broker environment.

    Trains until the rolling 100-episode win rate reaches 99%, appends one
    CSV row per episode to stats.csv, and checkpoints dqn_model.pkl whenever
    the 100-episode mean reward improves.
    """
    print('main')
    stats_file = pathlib.Path('stats.csv')
    if stats_file.exists():
        stats_file.unlink()
    broker = dqn.env.Broker('http://localhost:5000')
    env = dqn.env.HaliteEnv(broker)
    with U.make_session(num_cpu=4):
        observation_shape = env.observation_space.shape

        def make_obs_ph(name):
            import dqn.tf_util as U
            return U.BatchInput(observation_shape, name=name)

        # Create all the functions necessary to train the model
        act, train, update_target, debug = dqn.graph.build_train(
            make_obs_ph=make_obs_ph,
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        act = dqn.play.ActWrapper(
            act, {
                'make_obs_ph': make_obs_ph,
                'q_func': model,
                'num_actions': env.action_space.n,
            })
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=30000, initial_p=1.0, final_p=0.03)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
        learning_starts = 1000
        target_network_update_freq = 500
        checkpoint_freq = 20
        episode_rewards = [0.0]
        wins = [False]
        saved_mean_reward = None
        obs = env.reset()
        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, info = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)
                wins.append(info['win'])
            win_rate = round(np.mean(wins[-100:]), 4)
            # BUG FIX: `wins` holds booleans so the rolling mean lies in [0, 1];
            # the previous threshold of 99 could never be met and training
            # would never terminate as "solved".
            is_solved = t > 100 and win_rate >= 0.99
            if is_solved:
                print('solved')
                break
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > learning_starts:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    actions = np.argmax(actions, axis=1)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t > learning_starts and t % target_network_update_freq == 0:
                    update_target()
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 4)
            num_episodes = len(episode_rewards)
            exploration_rate = int(100 * exploration.value(t))
            if done:
                info = {
                    'date': str(dt.datetime.now()),
                    'episode': len(episode_rewards),
                    **info,
                    'win_rate': win_rate,
                    'mean_100ep_reward': mean_100ep_reward,
                    'exploration_rate': exploration_rate,
                }
                print('episode', info)
                # Append one CSV row per episode, writing the header first.
                if not stats_file.exists():
                    with stats_file.open('w') as fp:
                        fp.write(','.join(info.keys()) + '\n')
                with stats_file.open('a') as fp:
                    fp.write(','.join(map(str, info.values())) + '\n')
            if done and num_episodes % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", mean_100ep_reward)
                logger.record_tabular("mean win rate", win_rate)
                logger.record_tabular("% time spent exploring", exploration_rate)
                logger.dump_tabular()
            if done and (t > learning_starts and num_episodes > 100
                         and num_episodes % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".
                        format(saved_mean_reward, mean_100ep_reward))
                    act.save('dqn_model.pkl')
                    saved_mean_reward = mean_100ep_reward
        # Final save once solved.
        act.save('dqn_model.pkl')
        env.close()
def startTraining():
    """Deepq training loop for the robot-operation environment; waits for the
    robot controller to report READY before each action."""
    # Create the environment
    print('START ENV', RC.GB_CLIENT_ID(), RC.gbRobotHandle())
    env = RobotOperationEnvironment(RC.GB_CLIENT_ID(), RC.GB_CSERVER_ROBOT_ID, RC.gbRobotHandle())
    #print('ACTION_SPACE', env.action_space.shape)

    # Create all the functions necessary to train the model
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
        q_func=model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    )
    # Create the replay buffer
    replay_buffer = ReplayBuffer(50000)
    # Create the schedule for exploration starting from 1 (every action is random) down to
    # 0.02 (98% of actions are selected according to values predicted by the model).
    exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    episode_rewards = [0.0]
    obs = env.reset()
    print("Manipulator DEEPQ Training Experiment Start.")
    for t in itertools.count():
        print('Episode ', len(episode_rewards), 'Step ', t, '--------------')
        # Busy-wait (with a short sleep) until the robot is ready for a command.
        print('Start waiting for the next action', env._robot.getOperationState())
        while (env._robot.getOperationState() != RC.CROBOT_STATE_READY):
            time.sleep(0.01)
        # Take action and update exploration to the newest value
        action = act(obs[None], update_eps=exploration.value(t))[0]
        print('Generated action:', action)
        new_obs, rew, done, _ = env.step(action)
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0)
        # Mean over the last 100 completed episodes (excludes the in-progress one).
        is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
        if is_solved:
            # Show off the result
            #env.render()
            pass
        else:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                print('Generated actions:', actions)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            # Update target network periodically.
            if t % 1000 == 0:
                update_target()
        if done and len(episode_rewards) % 10 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", len(episode_rewards))
            logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
            logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            logger.dump_tabular()
# NOTE(review): fragment of a tutoring-RL training loop — the enclosing
# function and the statements after the last comment are outside this span.
episode_rewards = [0.0]
for t in itertools.count(start=1):
    # Take action and update exploration to the newest value
    action = act(np.array(student_history)[None], update_eps=exploration.value(t))[0]  #FIXME: shape (0, ) instead of (None, None)
    (correct, time_passed), reward, done = s.do_exercise(action)
    student_history += [
        action + correct * s.action_space
    ]  # append observation. Observations are index of exercise + NUM_EXERCISES if it was correct
    # Store transition in the replay buffer.
    # Previous history is the "state", extended history the "next state".
    replay_buffer.add(np.array(student_history[:-1]), action, reward,
                      np.array(student_history), float(done))
    episode_rewards[-1] += reward
    if done:
        s.reset()
        # Keep only the initial observation for the next episode.
        student_history = student_history[:1]
        episode_rewards.append(0)
    else:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if t > 1000:
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            # Update target network periodically.
class DQNAgent:
    """Double-dueling DQN agent for a traffic-light control task.

    Wraps baselines-style deepq.build_train with an (optionally prioritized)
    replay buffer and writes its learning 'fingerprint' (importance weights and
    TD errors) into channel 2 of the observation tensor for other agents.
    """

    def __init__(self, identifier, actions, observation_shape, num_steps, x=0.0, y=0.0):
        # identifier: string suffix used to scope this agent's TF variables.
        # num_steps: total planned training steps; sizes the exploration schedule.
        # x, y: agent position (presumably intersection coordinates — TODO confirm).
        self.id = identifier
        self.actions = actions
        self.x = x
        self.y = y
        self.yellow_steps = 0
        self.postponed_action = None
        self.obs = None
        self.current_action = None
        # Sized to batch_size (32); used as the fingerprint payload below.
        self.weights = np.ones(32)
        self.td_errors = np.ones(32)
        self.pre_train = 2500        # steps to collect before learning starts
        self.prioritized = False
        self.prioritized_eps = 1e-4  # added to |TD error| so priorities stay > 0
        self.batch_size = 32
        self.buffer_size = 30000
        self.learning_freq = 500
        self.target_update = 5000    # target-network sync period (steps)

        # Create all the functions necessary to train the model
        self.act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=lambda name: TrafficTfInput(observation_shape, name=name),
            q_func=dueling_model,
            num_actions=len(actions),
            optimizer=tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4),
            gamma=0.99,
            double_q=True,
            scope="deepq" + identifier
        )

        # Create the replay buffer
        if self.prioritized:
            self.replay_buffer = PrioritizedReplayBuffer(size=self.buffer_size, alpha=0.6)
            self.beta_schedule = LinearSchedule(num_steps // 4, initial_p=0.4, final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        self.exploration = LinearSchedule(schedule_timesteps=int(num_steps * 0.1),
                                          initial_p=1.0, final_p=0.01)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

    def take_action(self, t):
        """Return the action for step t, honoring any action postponed by a yellow phase."""
        if self.postponed_action is None:
            # Take action and update exploration to the newest value
            action = self.act(np.array(self.obs)[None],
                              update_eps=self.exploration.value(t))[0]
        else:
            # Take action postponed by yellow light transition
            action = self.postponed_action
            self.postponed_action = None
        return action

    def store(self, rew, new_obs, done):
        # Store transition in the replay buffer.
        self.replay_buffer.add(self.obs, self.current_action, rew, new_obs, float(done))

    def learn(self, t):
        """Sample a batch, take one training step, and refresh priorities/target net."""
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if t > self.pre_train:
            if self.prioritized:
                experience = self.replay_buffer.sample(self.batch_size,
                                                       beta=self.beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones,
                 self.weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = \
                    self.replay_buffer.sample(self.batch_size)
                self.weights = np.ones_like(rewards)

            # Minimize the error in Bellman's equation and compute TD-error
            self.td_errors = self.train(obses_t, actions, rewards, obses_tp1,
                                        dones, self.weights)

            # Update the priorities in the replay buffer
            if self.prioritized:
                new_priorities = np.abs(self.td_errors) + self.prioritized_eps
                self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            self.update_target_network(t)

    def update_target_network(self, t):
        # Update target network periodically.
        if t % self.target_update == 0:
            self.update_target()

    def add_fingerprint_to_obs(self, obs, weights, identifier, td_errors):
        """Write weights then td_errors into obs[2, identifier, :] and return obs.

        NOTE(review): mutates obs in place; assumes channel 2 row `identifier`
        is long enough for len(weights) + len(td_errors) entries — confirm.
        """
        idx = 0
        for w in weights:
            obs[2, identifier, idx] = w
            idx += 1
        for td in td_errors:
            obs[2, identifier, idx] = td
            idx += 1
        return obs

    def add_fingerprint(self, weights, identifier, td_errors):
        """Embed another agent's fingerprint into this agent's current observation."""
        self.obs = self.add_fingerprint_to_obs(self.obs, weights, identifier, td_errors)
class QFPolicy(object):
    """DDPG-style actor-critic policy for one agent in a multi-agent setting.

    Builds, in its own tf.Graph/Session, a deterministic policy network
    ('p_eval'/'p_target') and a centralized Q network ('q_eval'/'q_target')
    that conditions on all agents' observations and actions, with soft target
    updates (rate `te`) and a per-agent uniform replay buffer.

    Bug fixed: `_loss_train_op` previously referenced `policy_loss` after the
    line defining it had been commented out, raising NameError at graph build.
    The definition (-mean of Q evaluated at this agent's current policy action)
    is restored.
    """

    def __init__(self, id, seed, odims, adims, hid_dims, qf_hid_dims,
                 max_pool_size=int(1e6), p_lr=2e-3, q_lr=3e-3, te=1e-2, ):
        # p_lr = 1e-3, q_lr = 5e-3  (previously tried learning rates)
        # id: index of this agent; odims/adims: per-agent obs/action dims.
        # te: soft target-update rate (tau).
        self.id = id
        self.seed = seed
        self.odims = odims
        self.adims = adims
        self.adim = adims[id]
        self.n = len(odims)  # number of agents
        self.hid_dims = hid_dims
        self.qf_hid_dims = qf_hid_dims
        self.p_lr = p_lr
        self.q_lr = q_lr
        self.te = te
        self.pool = ReplayBuffer(max_pool_size)
        self._build_graph()
        self._init_session()

    def _build_graph(self):
        """Assemble placeholders, networks and train ops in a private graph."""
        self.g = tf.Graph()
        with self.g.as_default():
            self._placeholders()
            self._build_nn()
            self._loss_train_op()
            self.init = tf.global_variables_initializer()

    def _init_session(self):
        self.sess = tf.Session(graph=self.g)
        self.sess.run(self.init)
        # self.a_v.set_session(self.sess)
        # self.q_v.set_session(self.sess)
        # self.a__v.set_session(self.sess)
        # self.q__v.set_session(self.sess)

    def _placeholders(self):
        """Create per-agent observation/action placeholders and their concatenations."""
        o_sm_n = []
        for i in range(self.n):
            o_sm_n.append(tf.placeholder(tf.float32, (None, self.odims[i]),
                                         "observation" + str(i)))
        self.o_sm = o_sm_n[self.id]          # this agent's own observation
        self.o_sm_n = tf.concat(o_sm_n, 1)   # joint observation (all agents)
        a_sm_n = []
        for i in range(self.n):
            a_sm_n.append(tf.placeholder(tf.float32, (None, self.adims[i]),
                                         "action" + str(i)))
        self.a_sm_n = tf.concat(a_sm_n, 1)   # joint action (all agents)
        self.a_sm_list = a_sm_n
        self.q_target_sm = tf.placeholder(tf.float32, [None, 1], name="target")

    def _build_nn(self):
        """Build eval/target policy and Q networks and the soft-replace ops."""

        def policy_nn(s, scope, trainable, reuse=False):
            tf.set_random_seed(self.seed)
            with tf.variable_scope(scope, reuse=reuse):
                h = s
                for i, n in enumerate(self.hid_dims):
                    h = tf.layers.dense(h, n, activation=tf.nn.relu,
                                        name="polfc%i" % (i + 1), trainable=trainable)
                # logits=tf.layers.dense(h, self.adim, name='pfinal', trainable=trainable)
                logits = tf.layers.dense(h, 1, name='pfinal', trainable=trainable)
                # u = tf.random_uniform(tf.shape(logits))
                # action = tf.nn.softmax(logits - tf.log(-tf.log(u)), axis=-1)
                return logits

        def qf_nn(s, a, scope, trainable, reuse=False):
            tf.set_random_seed(self.seed)
            with tf.variable_scope(scope, reuse=reuse):
                h = tf.concat([s, a], axis=1, name='input')
                for i, n in enumerate(self.qf_hid_dims):
                    h = tf.layers.dense(h, n, activation=tf.nn.relu,
                                        name="qf%i" % (i + 1), trainable=trainable)
                q = tf.layers.dense(h, 1, name='qfinal', trainable=trainable)
                return q

        self.a = policy_nn(self.o_sm, 'p_eval', True)
        self.q = qf_nn(self.o_sm_n, self.a_sm_n, 'q_eval', True)
        # Q evaluated with this agent's action replaced by its current policy
        # output — used for the policy gradient.
        self.a_sm_list[self.id] = self.a
        self.a_sm_n_a = tf.concat(self.a_sm_list, 1)
        self.qa = qf_nn(self.o_sm_n, self.a_sm_n_a, 'q_eval', False, reuse=True)
        self.a_ = policy_nn(self.o_sm, 'p_target', False)
        self.q_ = qf_nn(self.o_sm_n, self.a_sm_n, 'q_target', False)

        self.pe_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='p_eval')
        self.pt_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='p_target')
        self.qe_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_eval')
        self.qt_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_target')

        # target net replacement
        self.soft_replace = [[tf.assign(pt, (1 - self.te) * pt + self.te * pe),
                              tf.assign(qt, (1 - self.te) * qt + self.te * qe)]
                             for pt, pe, qt, qe in zip(self.pt_params, self.pe_params,
                                                       self.qt_params, self.qe_params)]

        # NOTE(review): these used to be TFVariables wrappers (commented out in
        # _init_session); as plain tensors, set_flat/get_flat in
        # set_policy_params/get_policy_params will fail if called — confirm.
        self.a_v = self.a
        self.q_v = self.q
        self.a__v = self.a_
        self.q__v = self.q_
        self.saver = tf.train.Saver()

    def _loss_train_op(self):
        """Define the critic (TD) and actor (policy-gradient) train ops."""
        td_error = tf.reduce_mean(tf.square(self.q - self.q_target_sm))
        qf_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.q_train_op = qf_optimizer.minimize(td_error, var_list=self.qe_params)
        # FIX: this definition was commented out while `policy_loss` was still
        # used below, which raised NameError. Maximize Q under the current
        # policy by minimizing its negative mean.
        policy_loss = -tf.reduce_mean(self.qa)
        p_optimizer = tf.train.AdamOptimizer(self.p_lr)
        self.p_train_op = p_optimizer.minimize(policy_loss, var_list=self.pe_params)

    def set_policy_params(self, a_v, a__v):
        """Load flat parameter vectors into the eval/target policies."""
        self.a_v.set_flat(a_v)
        self.a__v.set_flat(a__v)

    def get_policy_params(self):
        """Return flat parameter vectors of the eval/target policies."""
        a_v = self.a_v.get_flat()
        a__v = self.a__v.get_flat()
        return a_v, a__v

    def action(self, obs):
        """Return the eval-policy action for a single observation."""
        if np.ndim(obs) == 1:
            obs = obs[np.newaxis, :]
        feed_dict = {self.o_sm: obs}
        a = self.sess.run(self.a, feed_dict=feed_dict)
        return a[0]

    def get_target_action(self, obs):
        """Return target-policy actions for a batch of observations."""
        if np.ndim(obs) == 1:
            obs = obs[np.newaxis, :]
        feed_dict = {self.o_sm: obs}
        a = self.sess.run(self.a_, feed_dict=feed_dict)
        return a

    def get_target_q(self, obs, act):
        """Return target-critic Q values for joint observations and actions."""
        if np.ndim(obs) == 1:
            obs = obs[np.newaxis, :]
        feed_dict = {self.a_sm_n: act, self.o_sm_n: obs}
        q_ = self.sess.run(self.q_, feed_dict=feed_dict)
        return q_

    def experience(self, obs, act, rew, new_obs, done):
        # Store transition in the replay buffer.
        # self.pool.add(obs, act, rew, new_obs, float(done))
        self.pool.add(obs, act, rew, new_obs, done)

    def pq_soft_replace(self):
        """Soft-update both target networks toward their eval counterparts."""
        self.sess.run(self.soft_replace)

    def q_train(self, obs, acts, q_target):
        """One critic update on joint obs/actions against the given TD targets."""
        feed_dict_q = {self.o_sm_n: obs, self.a_sm_n: acts,
                       self.q_target_sm: q_target, }
        self.sess.run(self.q_train_op, feed_dict_q)

    def p_train(self, obs, obs_n, act_n):
        """One actor update; other agents' actions are fed, ours comes from the policy."""
        feed_dict_p = {self.o_sm_n: obs, self.o_sm: obs_n[self.id], }
        for i in range(len(act_n)):
            if not i == self.id:
                feed_dict_p.update({self.a_sm_list[i]: act_n[i]})
        self.sess.run(self.p_train_op, feed_dict_p)

    def save_model(self):
        # NOTE(review): hard-coded absolute checkpoint path.
        checkpoint = '/home/lsq/PycharmProjects/spac/spacbackup/cps'
        self.saver.save(self.sess, checkpoint)

    def load_model(self, index):
        load_dir = '/home/lsq/PycharmProjects/spac/spacbackup/'
        spac_load_dir = load_dir + '0cps-' + '{}'.format(index)
        print(spac_load_dir)
        self.saver.restore(self.sess, spac_load_dir)

    def close_sess(self):
        self.sess.close()
class DeepqLearner:
    """Config-driven DQN trainer built on baselines deepq.build_train.

    Owns the TF session, (optionally prioritized) replay buffer, exploration
    schedule and checkpointing; `run()` drives the environment loop.

    Bug fixed: `estimate_time_remaining` multiplied the step *rate* (steps/sec)
    by the remaining step count — dimensionally wrong (steps^2/sec). It now uses
    seconds-per-step (duration / t) times remaining steps, keeping the original
    /60 unit conversion, and also guards t == 0 against division by zero.
    """

    def __init__(self, env, q_func, config=DEEPQ_CONFIG, callback=None):
        # env: gym-style environment; q_func: Q-network builder;
        # config: dict of hyperparameters; callback(locals, globals) -> bool
        # is polled every step and stops training when it returns True.
        self.env = env
        self.q_func = q_func
        self.config = config
        self.callback = callback

        # Create all the functions necessary to train the model
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=config["gpu_memory_fraction"])
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess.__enter__()  # make this the default session for build_train

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph
        def make_obs_ph(name):
            return ObservationInput(env.observation_space, name=name)

        act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=make_obs_ph,
            q_func=q_func,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
            gamma=config["gamma"],
            grad_norm_clipping=10,
            param_noise=config["param_noise"])

        act_params = {
            # 'make_obs_ph': make_obs_ph,
            # 'q_func': q_func,
            'num_actions': env.action_space.n,
        }
        self.act = ActWrapper(act, act_params)

        # Create the replay buffer
        self.config = config
        self.replay_buffer = None
        self.beta_schedule = None
        self.make_replay_buffer()

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        # Bookkeeping for run()/logging/checkpointing.
        self.t = 0
        self.episode_rewards = [0.0]
        self.num_episodes = 1
        self.saved_mean_reward = None
        self.saved_episode_num = None
        self.episode_frames = 0
        self.model_file = None
        self.start_time = 0
        self.episode_start_time = 0

    def make_replay_buffer(self):
        """(Re)create the replay buffer, prioritized or uniform per config."""
        if self.config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.config["buffer_size"],
                alpha=self.config["prioritized_replay_alpha"])
            if self.config["prioritized_replay_beta_iters"] is None:
                self.config["prioritized_replay_beta_iters"] = self.config[
                    "max_timesteps"]
            self.beta_schedule = LinearSchedule(
                self.config["prioritized_replay_beta_iters"],
                initial_p=self.config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.config["buffer_size"])
            self.beta_schedule = None

    def run(self):
        """Main environment loop: act, store, train, update target, checkpoint."""
        reset = True
        obs = self.env.reset()
        self.start_time = time.time()
        self.episode_start_time = time.time()

        with tempfile.TemporaryDirectory() as td:
            td = self.config["checkpoint_path"] or td
            self.model_file = os.path.join(td, "model")
            if tf.train.latest_checkpoint(td) is not None:
                load_state(self.model_file)
                logger.log('Loaded model from {}'.format(self.model_file))

            for self.t in range(self.config["max_timesteps"]):
                if self.callback is not None:
                    if self.callback(locals(), globals()):
                        break

                # Determine next action to take, then take that action and observe results
                action = self._action(obs, reset)
                env_action = action
                new_obs, rew, done, _ = self.env.step(env_action)
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                # Increment typical values
                reset = False
                self.episode_frames += 1
                self.episode_rewards[-1] += rew

                # See if done with episode
                if done:
                    obs = self._reset()
                    reset = True

                # Do training and deepq updating as needed
                if self.t > self.config["learning_starts"]:
                    if self.t % self.config["train_freq"] == 0:
                        self._train()
                    if self.t % self.config["target_network_update_freq"] == 0:
                        self.update_target()

    def _action(self, obs, reset):
        """Select an action via eps-greedy or parameter-space noise."""
        # Take action and update exploration to the newest value
        kwargs = {}
        if not self.config["param_noise"]:
            update_eps = self.exploration.value(self.t)
            # update_param_noise_threshold = 0.
        else:
            update_eps = 0.
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
            # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
            # for detailed explanation.
            update_param_noise_threshold = -np.log(
                1. - self.exploration.value(self.t) +
                self.exploration.value(self.t) / float(self.env.action_space.n))
            kwargs['reset'] = reset
            kwargs['update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True
        return self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]

    def _train(self):
        """One gradient step on a replay batch; refresh priorities if prioritized."""
        try:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if self.config["prioritized_replay"]:
                experience = self.replay_buffer.sample(
                    self.config["batch_size"],
                    beta=self.beta_schedule.value(self.t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = \
                    self.replay_buffer.sample(self.config["batch_size"])
                weights, batch_idxes = np.ones_like(rewards), None

            # Determine errors
            td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones,
                                   weights)

            if self.config["prioritized_replay"]:
                new_priorities = np.abs(
                    td_errors) + self.config["prioritized_replay_eps"]
                self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        except Exception as e:
            # Deliberate best-effort recovery: on any training/sampling failure,
            # discard and rebuild the replay buffer rather than crash the run.
            self.make_replay_buffer()
            print(e)

    def _reset(self):
        """Finish an episode: log, maybe checkpoint, reset counters and the env."""
        self.attempt_print()
        self.attempt_checkpoint()
        self.episode_rewards.append(0.0)
        self.num_episodes += 1
        self.episode_frames = 0
        self.episode_start_time = time.time()
        return self.env.reset()

    def calc_mean_100ep_reward(self):
        """Mean reward over the last 100 completed episodes (None before episode 2)."""
        if self.num_episodes <= 1:
            return None
        return round(np.mean(self.episode_rewards[-101:-1]), 1)

    def attempt_print(self):
        """Dump tabular training stats every `print_freq` episodes."""
        p_freq = self.config["print_freq"]
        if p_freq is not None and self.num_episodes % p_freq == 0:
            logger.record_tabular("% time spent exploring",
                                  int(100 * self.exploration.value(self.t)))
            logger.record_tabular("reward - current", self.episode_rewards[-1])
            logger.record_tabular("reward - mean", self.calc_mean_100ep_reward())
            logger.record_tabular("reward - saved", self.saved_mean_reward)
            logger.record_tabular("episode # - current", self.num_episodes)
            logger.record_tabular("episode # - saved", self.saved_episode_num)
            logger.record_tabular("steps - total", self.t)
            logger.record_tabular("steps - episode", self.episode_frames)
            logger.record_tabular(
                "time - ep duration",
                str(time.time() - self.episode_start_time) + "s")
            logger.record_tabular("time - remaining",
                                  self.estimate_time_remaining())
            logger.dump_tabular()

    def estimate_time_remaining(self):
        """Extrapolate wall-clock time to finish from the observed step rate."""
        duration = time.time() - self.start_time
        # Guard both divisions: no elapsed time or no steps yet -> no estimate.
        if duration <= 0 or self.t <= 0:
            return "Unknown"

        # FIX: seconds-per-step (duration / t) times remaining steps gives
        # remaining seconds; the previous t/duration * steps was rate * steps.
        time_remaining = duration / self.t * (self.config["max_timesteps"] -
                                              self.t) / 60.0
        suffix = ""

        # Format based on time
        if time_remaining < MINUTE:
            suffix = " seconds"
        elif time_remaining < HOUR:
            suffix = " minutes"
            time_remaining = time_remaining / MINUTE
        elif time_remaining < DAY:
            suffix = " hours"
            time_remaining = time_remaining / HOUR
        else:
            suffix = " days"
            time_remaining = time_remaining / DAY

        # Round remaining time and return
        time_remaining = round(time_remaining * 100.0) / 100.0
        return str(time_remaining) + suffix

    def attempt_checkpoint(self):
        """Save the model when the 100-episode mean reward reaches a new best."""
        # Determine if we're going to checkpoint
        c_freq = self.config["checkpoint_freq"]
        if c_freq is not None \
                and self.num_episodes > 100 \
                and self.t > self.config["learning_starts"] \
                and self.num_episodes % c_freq == 0:

            # Determine if reward is growing
            mean_100ep_reward = self.calc_mean_100ep_reward()
            if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward:
                if self.config["print_freq"] is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".
                        format(self.saved_mean_reward, mean_100ep_reward))
                self.saved_mean_reward = mean_100ep_reward
                self.saved_episode_num = self.num_episodes
                save_state(self.model_file)

    def save(self, save_path):
        """Serialize the act function (policy) to save_path."""
        print("Saving model to " + save_path)
        self.act.save(save_path)
class DQNMoveOnlyAgent(base_agent.BaseAgent):
    ''' DQN that takes in player_relative features and returns movements '''
    # NOTE(review): this class is truncated in the source — `step` ends with a
    # dangling `else:` whose body is not present in this chunk.

    def __init__(self,
                 learning_rate=5e-4,  # could use linearschedule here as well?
                 gamma=.99,
                 epsilon_max=1.0,
                 epsilon_min=0.001,
                 epsilon_decay_steps=300000,
                 learning_starts=1000,
                 train_freq=100,
                 target_update_freq=5000,
                 max_buffer_size=100000,
                 batch_size=16,
                 prioritized_replay_beta_iters=300000,  # in reality this would be max_steps -- for now just much larger than decay steps
                 training=True,
                 indicate_nonrandom_action=False,
                 prioritized=True,
                 prioritized_alpha=.6,  # b=.7, a=.5 for rank-based prioritization
                 prioritized_beta=.4,  # "rank-based likely not as good for sparse-reward structures" ... clipping limits outliers
                 save_file='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/network_saves',
                 save_dir='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/ckpts/',
                 ckpt_name='collect_minerals_6-23',
                 summary_path='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/summaries/',
                 buffer_path='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/buffers/buffer_6-23',
                 logdir='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/variable_logs.txt',
                 log=True):
        super(DQNMoveOnlyAgent, self).__init__()

        # NN hparams
        self.learning_rate = learning_rate
        self.gamma = gamma

        # agent hparams
        self.epsilon_max = epsilon_max
        self.epsilon_min = epsilon_min
        self.epsilon_decay_steps = epsilon_decay_steps
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.target_update_freq = target_update_freq
        self.indicate_nonrandom_action = indicate_nonrandom_action  # not sure exactly
        self.prioritized = prioritized
        self.prioritized_alpha = prioritized_alpha
        self.prioritized_beta = prioritized_beta
        self.save_file = save_file
        self.batch_size = batch_size
        self.log = log

        # other
        self.training = training
        self.max_reward = 0
        self.total_reward = 0
        self.last_state = None
        self.last_action = None

        # Pickled-buffer filename encodes whether it is prioritized.
        if self.prioritized:
            self.buffer_file = buffer_path + '_prioritized.p'
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_beta,
                                                final_p=1.0)
        else:
            self.buffer_file = buffer_path + '.p'

        # load and set epsilon
        if os.path.isfile(self.save_file + '.npy'):
            self.epsilon, self.initial_step = np.load(self.save_file + '.npy')  # can i just use loaded step for epsilon as well?
            print(f'epsilon loaded: {self.epsilon}')
        else:
            self.epsilon = 1.0
            self.initial_step = 0
        self.epsilons = [self.epsilon]

        # for saving and loading files
        if save_dir:
            self.online_save_dir = save_dir + 'online/'  # for use in checkpoints
            self.target_save_dir = save_dir + 'target/'

        if ckpt_name:
            self.ckpt_name = ckpt_name

        if summary_path:
            self.online_summary_path = summary_path + 'online/'  # for use in TB summaries
            self.target_summary_path = summary_path + 'target/'

        if self.log:
            self.init_logger(logdir)

        # build network
        if save_dir and ckpt_name:
            self.online_save_path = self.online_save_dir + ckpt_name + '.ckpt'
            self.target_save_path = self.target_save_dir + ckpt_name + '.ckpt'
        print("Building models...")
        tf.reset_default_graph()
        self.online_network = PlayerRelativeCNN(spatial_dims=FEATURE_SCREEN_SIZE,
                                                learning_rate=self.learning_rate,
                                                save_path=self.online_save_path,
                                                summary_path=self.online_summary_path,
                                                name='DQN')
        if self.training:
            # set up target_net and initialize replay buffer
            self.target_network = PlayerRelativeCNN(spatial_dims=FEATURE_SCREEN_SIZE,
                                                    learning_rate=self.learning_rate,
                                                    save_path=self.target_save_path,
                                                    summary_path=self.target_summary_path,
                                                    name='target_network')

            # initialize tf session
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            print('Initialization complete.')

            # check for and load networks/buffer if possible
            if os.path.isfile(self.online_save_path + '.index') and \
                    os.path.isfile(self.target_save_path + '.index'):
                self.online_network.load(self.sess)
                self.target_network.load(self.sess)

            # check for buffer to load
            if os.path.isfile(self.buffer_file):
                with open(self.buffer_file, 'rb') as f:
                    self.replay_buffer = pickle.load(f)
            else:
                if self.prioritized:
                    # alpha = 0 is same as uniform
                    self.replay_buffer = PrioritizedReplayBuffer(max_buffer_size,
                                                                 self.prioritized_alpha)
                else:
                    self.replay_buffer = ReplayBuffer(max_buffer_size)

            self.online_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'DQN')
            self.target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'target_network')
            self.online_network._init_train_fn(self.online_vars, grad_norm_clipping=10)  # what are good values for clip?
            self.target_network._init_train_fn(self.target_vars, grad_norm_clipping=10)
            print('online and target models loaded.')
            self._tf_init()

            if self.training:
                self._update_target_network()  # do i still need this?
        else:
            self._tf_init()

    def reset(self):
        ''' reset episode '''
        self.episodes += 1
        self.reward = 0

        if self.training:
            self.last_state = None
            self.last_action = None
            self.episode = self.online_network.global_episode.eval(self.sess)

    def step(self, obs):
        ''' If no units selected, selects army. Otherwise, move. '''
        self.steps += 1
        self.reward = obs.reward
        state = obs.observation.feature_screen.player_relative

        if self.reward > self.max_reward:
            self.max_reward = self.reward

        # handle terminal steps:
        if self.training:
            if obs.last():
                self._handle_episode_end()
            else:
                x, y = self._epsilon_greedy_action(state, available_actions, self.epsilon)  # stopped here, not sure how to proceed

                # update online DQN/target network if appropriate
                if (self.steps % self.train_freq == 0) and \
                        (len(self.replay_buffer) > self.batch_size):
                    self._train_network()
                if self.steps % self.target_update_freq == 0:
                    self._update_target_network()

                # add to replay buffer
                if self.last_state is not None:
                    self.replay_buffer.add(self.last_state, self.last_action,
                                           obs.reward, state, 0)
                self.last_state = state
                self.last_action = np.ravel_multi_index((x, y), FEATURE_SCREEN_SIZE)
        else:
            x, y = self.epsilon_greedy_action(state, available_actions, self.epsilon_min)
        return FUNCTIONS.Move_screen('now', (x, y))
        # NOTE(review): source chunk is truncated below — this trailing `else:`
        # has no visible matching `if` head or body in this chunk.
        else:
dqn.online_net.reset_noise() elif args.agent == 'MNFDQN': dqn.online_net.reset_noise() while not done: timestamp += 1 if args.agent == 'BootstrappedDQN': action = dqn.act_single_head(state[None], k) elif args.agent in ['NoisyDQN', 'BayesBackpropDQN', 'MNFDQN']: action = dqn.act(state[None], eval=False) elif args.agent == 'DQN': action = dqn.act_e_greedy(state[None], epsilon=epsilon) next_state, reward, done, _ = env.step(int(action)) # Store the transition in memory replay_buffer.add(state, action, reward, next_state, float(done)) # Move to the next state state = next_state # if timestamp % args.target_update_freq == 0: dqn.update_target_net() if timestamp > args.learning_starts: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( args.batch_size) loss = dqn.learn(obses_t, actions, rewards, obses_tp1, dones) log.add_scalar('loss', loss, timestamp) # if episode % 10 == 0: # visited = []
def learn(self):
    """Train a DQN on the graph-sampling environment.

    Standard baselines-style loop: act eps-greedily, store transitions in a
    uniform replay buffer, train on 32-sample batches after 1000 steps, sync
    the target network every 1000 steps, and log episode/error stats every 10
    episodes. Runs indefinitely (itertools.count()).
    """
    # Local aliases for the functions built elsewhere on this object.
    act = self.act
    train = self.train
    update_target = self.update_target
    env = self.env

    with self.session.as_default():
        replay_buffer = ReplayBuffer(self._replay_buffer_size)
        exploration = LinearSchedule(
            schedule_timesteps=self._exploration_schedule_steps,
            initial_p=self._exploration_initial_prob,
            final_p=self._exploration_final_prob)

        tf_util.initialize()
        update_target()

        episode_rewards = [0.0]
        episode_errors = []
        episode_rw_errors = []   # random-walk baseline errors, for comparison
        episode_error_diffs = []
        observation = env.reset()
        cnt = itertools.count()
        for t in itertools.count():
            # print("iter: ", t)
            # Take action and update exploration to the newest value
            action = act(observation[None], update_eps=exploration.value(t))[0]
            new_observation, reward, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(observation, action, reward, new_observation,
                              float(done))
            observation = new_observation

            episode_rewards[-1] += reward
            if done:
                episode_errors.append(env.error)
                episode_rewards.append(0)

                # Optionally compare against a random-walk sampling baseline.
                if self._random_walk_sampling_args is not None:
                    sampling_args = self._random_walk_sampling_args
                    sampling_args.update({"graph": env.graph})
                    rw_error = random_walk_error(sampling_args)
                    episode_rw_errors.append(rw_error)
                    episode_error_diffs.append(rw_error - env.error)

                if len(episode_rewards) % 10 == 0:
                    nmse = env.get_current_nmse()
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", len(episode_rewards))
                    logger.record_tabular(
                        "mean episode reward",
                        round(np.mean(episode_rewards[-101:-1]), 3))
                    logger.record_tabular(
                        "mean episode error",
                        round(np.mean(episode_errors[-101:-1]), 3))
                    logger.record_tabular("nmse", nmse)
                    logger.record_tabular(
                        "sampling set", [int(v) for v in env.sampling_set])
                    logger.record_tabular("% time spent exploring",
                                          int(100 * exploration.value(t)))
                    if self._random_walk_sampling_args is not None:
                        logger.record_tabular(
                            "mean random walk error",
                            round(np.mean(episode_rw_errors[-101:-1]), 3))
                        logger.record_tabular(
                            "mean error diff",
                            round(np.mean(episode_error_diffs[-101:-1]), 3))
                    logger.dump_tabular()

                observation = env.reset()

            # Minimize the Bellman equation error on replay buffer sample batch
            if t > 1000:
                (observations_t, actions, rewards, observations_tp1,
                 dones) = replay_buffer.sample(32)
                train(observations_t, actions, rewards, observations_tp1, dones,
                      np.ones_like(rewards))
            if t % 1000 == 0:
                # Update target network periodically.
                update_target()
class DeepQ(object):
    """Train a deepq model.

    Class wrapper around the baselines deepq training loop; also supports
    pretraining from transferred instances (see `transfer_pretrain`).

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of
        every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is
        annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
        set to None to disable printing
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is
        restored at the end of the training. If you do not wish to restore the
        best version at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning
        starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial
        value to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act
        function.
    """

    def __init__(self,
                 env,
                 q_func,
                 lr=5e-4,
                 max_timesteps=100000,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 train_freq=1,
                 batch_size=32,
                 print_freq=100,
                 checkpoint_freq=10000,
                 learning_starts=1000,
                 gamma=1.0,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 callback=None,
                 max_episodes=100):
        # Stash all hyperparameters on the instance; the graph/buffer are
        # built later by make_build_train() and initialize().
        self.env = env
        self.q_func = q_func
        self.lr = lr
        self.max_timesteps = max_timesteps
        self.buffer_size = buffer_size
        self.exploration_fraction = exploration_fraction
        self.exploration_final_eps = exploration_final_eps
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.checkpoint_freq = checkpoint_freq
        self.learning_starts = learning_starts
        self.gamma = gamma
        self.target_network_update_freq = target_network_update_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.prioritized_replay_eps = prioritized_replay_eps
        self.param_noise = param_noise
        self.callback = callback
        self.max_episodes = max_episodes

        # Create all the functions necessary to train the model
        self.sess = tf.Session()
        self.sess.__enter__()  # make this the default session

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph
        self.observation_space_shape = env.observation_space.shape

    def make_obs_ph(self, name):
        """Build the observation placeholder used by deepq.build_train."""
        return U.BatchInput(self.observation_space_shape, name=name)

    def make_build_train(self):
        """Build act/train/update_target/debug functions; must run before training."""
        # Build act and train networks
        self.act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=self.make_obs_ph,
            q_func=self.q_func,
            num_actions=self.env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.lr),
            gamma=self.gamma,
            grad_norm_clipping=10,
            param_noise=self.param_noise)

        self.act_params = {
            'make_obs_ph': self.make_obs_ph,
            'q_func': self.q_func,
            'num_actions': self.env.action_space.n,
        }

        self.act = ActWrapper(self.act, self.act_params)
        return 'make_build_train() complete'

    def initialize(self):
        """Create the replay buffer and exploration schedule; init TF variables."""
        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                self.prioritized_replay_beta_iters = self.max_timesteps
            self.beta_schedule = LinearSchedule(
                self.prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        # Create the schedule for exploration starting from 1.
        # self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * self.max_timesteps),
        #                                   initial_p=1.0,
        #                                   final_p=self.exploration_final_eps)
        # NOTE(review): linear annealing is disabled — exploration is held
        # constant at exploration_final_eps for the whole run.
        self.exploration = ConstantSchedule(self.exploration_final_eps)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()
        return 'initialize() complete'

    def transfer_pretrain(self, transferred_instances, epochs, tr_batch_size,
                          keep_in_replay_buffer=True):
        """
        This is a custom function from University of Toronto group to first
        pretrain the deepq train network with transferred instances. These
        instances must be zip([s],[a],[r],[s']) tuples mapped over to the same
        state and action spaces as the target task environment.

        No output - just updates parameters of train and target networks.
        """
        # TODO - function that trains self.act and self.train using mapped instances
        # Transferred instances carry no terminal flag; treat all as non-terminal.
        done = False
        # pack all instances into replay buffer
        for obs, action, rew, new_obs in transferred_instances:
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))

        for epoch in range(epochs):
            obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                tr_batch_size)
            weights, batch_idxes = np.ones_like(rewards), None
            td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones,
                                   weights)
            self.update_target()

        if keep_in_replay_buffer is not True:
            # Discard the transferred transitions before real training.
            self.replay_buffer = ReplayBuffer(self.buffer_size)
        return 'transfer_pretrain() complete'

    def task_train(self):
        """Main training loop on the target task.

        Returns (act, episode_rewards, episode_steps); restores the
        best-checkpointed model at the end when one was saved.
        """
        self.episode_rewards = [0.0]
        self.episode_steps = [0.0]
        self.saved_mean_reward = None
        obs = self.env.reset()
        reset = True

        with tempfile.TemporaryDirectory() as td:
            model_saved = False
            model_file = os.path.join(td, "model")
            for t in range(self.max_timesteps):
                if self.callback is not None:
                    if self.callback(locals(), globals()):
                        break

                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(t)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - self.exploration.value(t) +
                        self.exploration.value(t) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                action = self.act(np.array(obs)[None], update_eps=update_eps,
                                  **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                self.episode_rewards[-1] += rew
                self.episode_steps[-1] += 1
                if done:
                    obs = self.env.reset()
                    self.episode_rewards.append(0.0)
                    self.episode_steps.append(0.0)
                    reset = True

                if t > self.learning_starts and t % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size, beta=self.beta_schedule.value(t))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = \
                            self.replay_buffer.sample(self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    td_errors = self.train(obses_t, actions, rewards, obses_tp1,
                                           dones, weights)
                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if t > self.learning_starts and t % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target()

                mean_100ep_reward = round(
                    np.mean(self.episode_rewards[-101:-1]), 1)
                num_episodes = len(self.episode_rewards)
                if done and self.print_freq is not None and len(
                        self.episode_rewards) % self.print_freq == 0:
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * self.exploration.value(t)))
                    logger.dump_tabular()

                # Checkpoint whenever the 100-episode mean reward improves.
                if (self.checkpoint_freq is not None
                        and t > self.learning_starts and num_episodes > 100
                        and t % self.checkpoint_freq == 0):
                    if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward:
                        if self.print_freq is not None:
                            logger.log(
                                "Saving model due to mean reward increase: {} -> {}"
                                .format(self.saved_mean_reward,
                                        mean_100ep_reward))
                        U.save_state(model_file)
                        model_saved = True
                        self.saved_mean_reward = mean_100ep_reward

                if num_episodes >= self.max_episodes:
                    break

            if model_saved:
                if self.print_freq is not None:
                    logger.log("Restored model with mean reward: {}".format(
                        self.saved_mean_reward))
                U.load_state(model_file)

        return self.act, self.episode_rewards, self.episode_steps

    def get_q_values(self, obs):
        '''
        Input: obs should be a numpy array with shape (?,state_space)
        Output: returns Q values for each possible action with shape (?,action_space)
        '''
        return self.debug['q_values'](obs)
def train(model_file, game="CartPole-v1"):
    """Train at a game.

    Generator: runs a DQN training loop on ``game`` and, every 3000 steps,
    saves the model to ``model_file`` (if given) and yields the current act
    function so the caller can evaluate the policy mid-training.
    """
    with tf_util.make_session(8):
        env = gym.make(game)

        def make_placeholder(name):
            """Make a placeholder input."""
            return tf_util.BatchInput(env.observation_space.shape, name=name)

        act_params = {
            'make_obs_ph': make_placeholder,
            'q_func': model,
            'num_actions': env.action_space.n
        }
        # NOTE: the unpacked `train` here shadows this generator's own name;
        # inside the loop `train(...)` is the TF train step, not a recursion.
        act, train, update_target, debug = deepq.build_train(
            **act_params,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4)
        )
        act = ActWrapper(act, act_params)
        replay_buffer = ReplayBuffer(50000)
        # Anneal exploration epsilon from 1.0 to 0.02 over 100k steps.
        exploration = LinearSchedule(
            schedule_timesteps=100000,
            initial_p=1.0,
            final_p=0.02
        )
        tf_util.initialize()
        update_target()
        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)
            # Render while the episode count is a multiple of 100 (i.e. for
            # the duration of every 100th episode).
            if not len(episode_rewards) % 100:
                env.render()
            if t > 1000:
                # Train on a uniformly sampled minibatch with unit
                # importance weights.
                obses_t, actions, rewards, obses_tp1, dones = (
                    replay_buffer.sample(32)
                )
                train(
                    obses_t, actions, rewards, obses_tp1, dones,
                    np.ones_like(rewards)
                )
            if not t % 1000:
                update_target()
            if not t % 3000:
                # Periodic checkpoint + hand control back to the caller.
                if model_file:
                    tf_util.save_state(model_file)
                yield act
            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular(
                    "mean episode reward",
                    round(np.mean(episode_rewards[-101:-1]), 1)
                )
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * exploration.value(t))
                )
                logger.dump_tabular()
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          callback=None):
    """Train a DQN policy on ``env`` with uniform replay.

    Simplified variant of baselines' deepq.learn: no prioritized replay and
    no parameter noise.  Checkpoints the model (to ``checkpoint_path`` if
    given, else a temp directory) whenever the mean-100-episode reward
    improves, and restores the best checkpoint before returning.

    Returns the (wrapped) act function.
    """
    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }
    act = ActWrapper(act, act_params)
    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    # NOTE(review): beta_schedule is never read in this function (leftover
    # from the prioritized-replay variant).
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    # NOTE(review): `reset` is tracked but never consumed here (kwargs stays
    # empty) — only the param-noise variant of this loop uses it.
    reset = True
    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td
        model_file = os.path.join(td, "model")
        model_saved = False
        # Resume from an existing checkpoint, if one is present.
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        for t in range(max_timesteps):
            if callback is not None:
                # Callback returning True aborts training early.
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            update_eps = exploration.value(t)
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True
            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer, with uniform importance weights.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1,
                                  dones, weights)
            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()
            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                # Checkpoint only when the rolling mean reward improves.
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            # Restore the best-performing checkpoint before returning.
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)
    return act
def sobolev_learn_episode(
        env,
        q_func,
        lr=5e-4,
        max_episodes=1000,
        buffer_size=50000,
        epsilon=.1,
        #exploration_fraction=0.1,
        #exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None,
        alpha=1.0,
        grad_norm_clipping=10.0):
    """Train a deepq model.

    Episode-bounded variant: runs until ``max_episodes`` episodes complete
    (rather than a fixed number of timesteps) and uses a constant-epsilon
    exploration schedule.  The Q network is built with
    ``deepq.build_sobolev_train`` (``alpha`` weights the Sobolev term).

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values
        of every action.
    lr: float
        learning rate for adam optimizer
    max_episodes: int
        number of episodes to train for
    buffer_size: int
        size of the replay buffer
    epsilon: float
        constant random-action probability
    train_freq: int
        update the model every `train_freq` steps.
        set to None to disable printing
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is
        restored at the end of the training. If you do not wish to restore
        the best version at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial
        value to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act
        function.
    """
    # Create all the functions necessary to train the model
    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return U.BatchInput(observation_space_shape, name=name)

    act, train, update_target, debug = deepq.build_sobolev_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=grad_norm_clipping,
        param_noise=param_noise,
        alpha=alpha)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }
    act = ActWrapper(act, act_params)
    # Create the replay buffer
    '''
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    '''
    # NOTE(review): the prioritized-replay setup above is disabled, yet the
    # training loop below still branches on `prioritized_replay`.  Calling
    # this function with prioritized_replay=True will fail
    # (beta_schedule is None and replay_buffer has no update_priorities) —
    # either re-enable the block or drop the parameter.
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    exploration = ConstantSchedule(epsilon)
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    episode_rewards = [0.0]
    episode_lengths = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        e = 0  # num of current episode
        t = 0  # timestep
        while e < max_episodes:
            if callback is not None:
                # Callback returning True aborts training early.
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between
                # perturbed and non-perturbed policy is comparable to
                # eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration,
                # Plappert et al., 2017 for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += rew
            episode_lengths[-1] += 1
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                episode_lengths.append(0.0)
                reset = True
            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1,
                                  dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)
            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            # increment counters
            t += 1  # increment timestep
            if done:
                e += 1  # increment episode
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()
            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                # Checkpoint only when the rolling mean reward improves.
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            # Restore the best-performing checkpoint before returning.
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)
    return act
def train_model(self, batch_size=32, policy_measure='optimal',
                convergence_threshold=500, episodes_to_explore=100):
    """Train the online/target Q networks on the trading environment.

    Runs ``self.train_episodes`` episodes, keeps the ten best-scoring model
    checkpoints on disk (scored per ``policy_measure``), and stops early
    once the mean reward of the last 50 episodes exceeds
    ``convergence_threshold`` while exploration has nearly finished.

    Parameters
    ----------
    batch_size: int
        minibatch size for each SGD update
    policy_measure: str
        how to score an episode's policy: 'average' (average profit per
        trade), 'highest' (total reward) or 'optimal' (|average| * total)
    convergence_threshold: float
        mean reward over the last 50 episodes above which training is
        considered converged
    episodes_to_explore: int
        number of episodes over which epsilon is annealed from 1.0 to 0.02
    """
    self.env._reset(train=True, Oanda=self.Oanda)
    # Defining the directory format for the tensor models
    timestamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%H%M')
    self.tensor_dir_template = os.path.join(self.parent_path,
                                            timestamp + '_Episode%s.ckpt')
    # Reset tensor folder
    self.reset_tensor_folder()
    steps_per_episode = self.env.sim._end - self.env.sim.current_index
    total_steps = steps_per_episode * self.train_episodes
    exploration = LinearSchedule(steps_per_episode * episodes_to_explore,
                                 final_p=0.02,
                                 initial_p=1.0)
    # FIX: buffer size must be an int — total_steps * 1.2 is a float, which
    # breaks the buffer's modular index arithmetic once it wraps around.
    replaybuffer = ReplayBuffer(int(total_steps * 1.2))
    # Use of parallelism
    config_proto = tf.ConfigProto(inter_op_parallelism_threads=8,
                                  intra_op_parallelism_threads=8)
    current_top_10s = [
    ]  # Keep track of top 10 performing models after every episodes
    with tf.Session(config=config_proto) as sess:
        sess.run(tf.global_variables_initializer())
        self.online_network, self.target_network = update_target_network(
            sess, self.online_network, self.target_network)
        saver = tf.train.Saver(max_to_keep=None)
        t = 0
        self.reset_bookkeeping_tools()
        max_reward = 0
        self.best_index = 0
        for epi in range(1, self.train_episodes + 1):
            self.env._reset(train=True, Oanda=self.Oanda)
            state = self.env.sim.states[0]
            done = False
            solved = False
            action_dict = {0: 0, 1: 0, 2: 0}
            print("Training Period: %s - %s" %
                  (self.env.sim.date_time[0],
                   self.env.sim.date_time[self.env.sim.train_end_index]))
            while not done:
                # Predict action given this observation, with random chance
                # of epsilon (Exploration)
                action = q_act(state, self.online_network,
                               exploration.value(t), self.env, sess)
                # if we are still holding a trade, as specified by the
                # trade_period
                if self.env.portfolio.holding_trade:
                    action = 2
                action_dict[action] += 1
                # Obtain next state and reward with action
                new_state, reward, done, _ = self.env._step(action)
                # Store this transition in memory
                replaybuffer.add(state, action, reward, new_state,
                                 float(done))
                state = new_state
                t += 1
                if t > 500:
                    # Optimize Online network with SGD
                    self.online_network, self.target_network = mini_batch_training(
                        sess,
                        self.env,
                        self.online_network,
                        self.target_network,
                        replaybuffer,
                        BATCH_SIZE=batch_size)
                if t % 500 == 0:
                    # Periodically update target network with online network
                    self.online_network, self.target_network = update_target_network(
                        sess, self.online_network, self.target_network)
                if done:
                    # Boring book-keeping after every episode
                    self.journal_record.append(self.env.portfolio.journal)
                    self.avg_reward_record.append(
                        self.env.portfolio.average_profit_per_trade)
                    self.reward_record.append(
                        self.env.portfolio.total_reward)
                    self.equity_curve_record.append(
                        self.env.portfolio.equity_curve)
                    print(
                        "End of Episode %s, Total Reward is %s, Average Reward is %.3f"
                        % (epi, self.env.portfolio.total_reward,
                           self.env.portfolio.average_profit_per_trade))
                    print(
                        "Percentage of time spent on exploring (Random Action): %s %%"
                        % (int(100 * exploration.value(t))))
                    print(action_dict)
                    assert policy_measure in [
                        'average', 'highest', 'optimal'
                    ], "policy measure can only be 'average', 'highest', or 'optimal'"
                    if policy_measure == 'average':
                        score = self.avg_reward_record[-1]
                    elif policy_measure == 'highest':
                        score = self.reward_record[-1]
                    elif policy_measure == 'optimal':
                        score = np.abs(self.avg_reward_record[-1]
                                       ) * self.reward_record[-1]
                    if any(score > x[1] for x in
                           current_top_10s) or not current_top_10s:
                        # If this score is better than any top 10 score OR
                        # first episode
                        print("Top 10 Score! Score: %s" % score)
                        episode_path = self.tensor_dir_template % epi
                        saver.save(sess, episode_path)
                        if score > max_reward:
                            # If this is the new best score, do the following
                            max_reward = score
                            self.best_index = epi
                            print("New Maximum Score found! Score: %s" % score)
                        if len(current_top_10s) < 10:
                            # Populate the top 10 array if there aren't enough
                            current_top_10s.append((epi, score))
                            # Sort in descending order by score
                            current_top_10s = sorted(current_top_10s,
                                                     key=lambda x: x[1],
                                                     reverse=True)
                        else:
                            # Find the lowest scoring episode, which is the
                            # last element
                            weakling = current_top_10s[-1][0]
                            # Replace the lowest scoring episode with this
                            # episode and its score
                            current_top_10s.pop(-1)
                            current_top_10s.append((epi, score))
                            # Sort in descending order
                            current_top_10s = sorted(current_top_10s,
                                                     key=lambda x: x[1],
                                                     reverse=True)
                        print()
                    # Check for Convergence
                    if np.mean(
                            self.reward_record[-51:-1]
                    ) > convergence_threshold and exploration.value(
                            t) < 0.04:
                        solved = True
                    if solved:
                        print("Converged!")
                        self.best_models = current_top_10s
                        print()
                        break
class DQN(BaseAgent):
    def __init__(self,
                 env,
                 name='default',
                 alg_name='dqn',
                 network_type='mini-mlp',
                 total_timesteps=5e7,
                 batch_size=32,
                 lr=1e-3,
                 gamma=0.99,
                 buffer_size=1e6,
                 final_eps=0.05,
                 exploration_fraction=0.1,
                 training_start=1e5,
                 target_update_freq=1e4,
                 optimizer=tf.train.AdamOptimizer,
                 gradient_clipping=None,
                 reward_clipping=False,
                 tau=1.,
                 double_q=False,
                 dueling=False,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.5,
                 prioritized_replay_beta_init=0.4,
                 prioritized_replay_beta_fraction=1.0,
                 prioritized_replay_eps=1e-6,
                 rolling_reward_mean=20,
                 solved_callback=None,
                 render_training=False,
                 **kwargs):
        """
        Implementation of the Deep Q Learning (DQN) algorithm formulated by
        Mnih et. al. Contains some well known improvements over the vanilla
        DQN.

        Parameters
        ----------
        env: gym.Environment
            (gym) Environment the agent shall learn from and act on
        name: str
            descriptive name of this DQN configuration, e.g. 'atari-breakout'
        network_type: str
            which network is from 'networks.py'
        total_timesteps: int or float
            number of training timesteps
        batch_size: int
            size of minibatch per backprop
        lr: float
            learning rate
        gamma: float
            discount factor gamma for bellman target
        buffer_size: int or float
            maximum number of in replay buffer
        final_eps: float
            value to which epsilon is annealed
        exploration_fraction: float
            fraction of traing timesteps over which epsilon is annealed
        training_start: int
            timestep at which training of the q network begins
        target_update_freq: int
            frequency of target network updates (in timesteps)
        optimizer: tf.Optimizer
            optimizer class which shall be used such as Adam or RMSprop
        gradient_clipping: int
            if not None, gradients are clipped by this value by norm
        reward_clipping: float
            rewards will be clipped to this value if not None
        tau: float
            interpolation constant for soft update. 1.0 corresponds to a
            full synchronisation of networks weights, as in the original
            DQN paper
        double_q: bool
            enables Double Q Learning for DQN
        dueling: bool
            splits network architecture into advantage and value streams.
            V(s, a) gets more frequent updates, should stabalize learning
        prioritized_replay: True
            use (proportional) prioritized replay
        prioritized_replay_alpha: float
            alpha for weighting priorization
        prioritized_replay_beta_init: float
            initial value of beta for prioritized replay buffer
        prioritized_replay_beta_fraction: float
            fraction of total timesteps to anneal beta to 1.0
        prioritized_replay_eps: float
            epsilon to add to the TD errors when updating priorities.
        rolling_reward_mean: int
            window of which the rolling mean in the statistics is computed
        solved_callback: function
            function which gets as an input the episode rewards as an array
            and must return a bool. if returned True, the training is
            considered as done and therefore prematurely interrupted.
        render_training: bool
            whether to render the environment while training
        """
        # instance name
        self.name = name
        # environment to act on / learn from
        self.env = env
        # basic DQN parameters
        self.total_timesteps = float(total_timesteps)
        self.buffer_size = int(float(buffer_size))
        self.batch_size = batch_size
        self.final_eps = final_eps
        self.lr = float(lr)
        self.gamma = float(gamma)
        self.exploration_fraction = float(exploration_fraction)
        self.training_start = int(float(training_start))
        self.target_update_freq = int(float(target_update_freq))
        # tf.Optimizer
        self.optimizer = optimizer
        # minor changes as suggested in some papers
        self.gradient_clipping = int(
            gradient_clipping) if gradient_clipping is not None else None
        self.reward_clipping = int(
            reward_clipping) if reward_clipping is not None else None
        # enhancements to DQN published in papers
        self.tau = float(tau)
        self.double_q = double_q
        self.dueling = dueling
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = float(prioritized_replay_alpha)
        self.prioritized_replay_beta_init = float(prioritized_replay_beta_init)
        self.prioritized_replay_beta_fraction = float(
            prioritized_replay_beta_fraction)
        self.prioritized_replay_eps = float(prioritized_replay_eps)
        # function to determine whether agent is able to act well enough
        self.solved_callback = solved_callback
        # call env.render() each training step
        self.render_training = render_training
        # sliding window for reward calc
        self.rolling_reward_mean = rolling_reward_mean
        # stores latest measure for best policy, e.g. best mean over last N
        # episodes
        self.latest_best = 0.0
        super().__init__(env, alg_name, name, **kwargs)
        # calculate timestep where epsilon reaches its final value
        self.schedule_timesteps = int(self.total_timesteps *
                                      self.exploration_fraction)
        # sanity checks
        assert 0.0 < self.tau <= 1.0
        # env specific parameter
        self.obs_shape = env.observation_space.shape
        self.num_actions = env.action_space.n
        # tf scopes
        self.Q_SCOPE = 'q_network'
        self.TARGET_SCOPE = 'target_network'
        # build Q and target network; using different scopes to distinguish
        # variables for gradient computation
        self.q_t_in, self.q_t = build_network(self.obs_shape,
                                              self.num_actions,
                                              network_type=network_type,
                                              dueling=self.dueling,
                                              scope=self.Q_SCOPE,
                                              summaries=True)
        self.target_tp1_in, self.target_tp1 = build_network(
            self.obs_shape,
            self.num_actions,
            dueling=self.dueling,
            network_type=network_type,
            scope=self.TARGET_SCOPE)
        # double Q learning needs to pass observations t+1 to the q networks
        # for action selection, so we reuse already created q network
        # variables but with different input
        if self.double_q:
            self.q_tp1_in, self.q_tp1 = build_network(
                self.obs_shape,
                self.num_actions,
                dueling=self.dueling,
                network_type=network_type,
                scope=self.Q_SCOPE,
                reuse=True)
        # create replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
        # list of variables of the different networks. required for copying
        # Q to target network and excluding target network variables from
        # backprop
        self.q_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope=self.Q_SCOPE)
        self.target_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope=self.TARGET_SCOPE)
        # placeholders used in loss function
        self._L_r = tf.placeholder(tf.float32, (None, ), name='loss_rewards')
        self._L_a = tf.placeholder(tf.int32, (None, ), name='loss_actions')
        self._L_d = tf.placeholder(tf.float32, (None, ), name='loss_dones')
        # pointer to td error vector
        # NOTE: this placeholder is replaced by the real (qj - y) tensor
        # inside _loss() below; it only exists so the attribute is defined.
        self._td_errors = tf.placeholder(tf.float32, (None, ),
                                         name='td_errors')
        # configure prioritized replay
        if self.prioritized_replay:
            self._is_weights = tf.placeholder(
                tf.float32, (None, ), name='importance_sampling_weights')
            # schedule for PR beta
            beta_steps = int(self.total_timesteps *
                             self.prioritized_replay_beta_fraction)
            self.pr_beta = LinearSchedule(
                beta_steps,
                initial_p=prioritized_replay_beta_init,
                final_p=1.0)
        # epsilon schedule
        self.eps = LinearSchedule(self.schedule_timesteps, final_p=final_eps)
        # init optimizer
        self.opt = self.optimizer(self.lr)
        # specify loss function, only include Q network variables for
        # gradient computation
        self.gradients = self.opt.compute_gradients(self._loss(),
                                                    var_list=self.q_net_vars)
        # clip gradients by norm
        if self.gradient_clipping is not None:
            for idx, (grad, var) in enumerate(self.gradients):
                if grad is not None:
                    self.gradients[idx] = (tf.clip_by_norm(
                        grad, self.gradient_clipping), var)
        # create training op
        self.train_op = self.opt.apply_gradients(self.gradients)
        # update_target_fn will be called periodically to copy Q network to
        # target Q network. variable lists are sorted by name to ensure that
        # correct values are copied
        self.update_target_ops = []
        for var_q, var_target in zip(
                sorted(self.q_net_vars, key=lambda v: v.name),
                sorted(self.target_net_vars, key=lambda v: v.name)):
            # soft update: tau=1.0 degenerates to a hard copy
            v_update = var_target.assign(self.tau * var_q +
                                         (1 - self.tau) * var_target)
            self.update_target_ops.append(v_update)
        self.update_target_ops = tf.group(*self.update_target_ops)
        # global tf.Session and Graph init
        self.sess = tf.Session()
        # init tensorboard, variables and debug
        self._finalize_init()
        # sync networks before training
        self.sess.run(self.update_target_ops)

    def _setup_tensorboard(self):
        """
        Adds all variables that might help debugging to Tensorboard.
        At the end, the FileWriter is constructed pointing to the specified
        directory.
        """
        # more placeholders for summarised variables; along with summaries
        self.eps_ph = tf.placeholder(tf.float32, (), name='epsilon')
        self.rew_ph = tf.placeholder(tf.float32, (), name='rolling-reward')
        scalar_summary('epsilon', self.eps_ph)
        scalar_summary('reward', self.rew_ph)
        # display q_values while training
        for a_i in range(self.num_actions):
            scalar_summary('QTa_{}'.format(a_i + 1),
                           tf.reduce_mean(self.target_tp1[:, a_i]),
                           scope='Q-Values')
            scalar_summary('Qa_{}'.format(a_i + 1),
                           tf.reduce_mean(self.q_t[:, a_i]),
                           scope='Q-Values')
        # plot network weights
        with tf.variable_scope('weights'):
            for qv in self.q_net_vars:
                tf.summary.histogram('{}'.format(qv.name), qv)
            for tv in self.target_net_vars:
                tf.summary.histogram('{}'.format(tv.name), tv)
        # gradient histograms
        with tf.variable_scope('gradients'):
            for g in self.gradients:
                tf.summary.histogram('{}-grad'.format(g[1].name), g[0])

    def _loss(self):
        """ Defines loss as layed out in the original Nature paper """
        with tf.variable_scope('loss'):
            # either use maximum target q or use value from target network
            # while the action is chosen by the q net
            if self.double_q:
                act_tp1_idxs = tf.stop_gradient(tf.argmax(self.q_tp1, axis=1))
                q_tp1 = tf.reduce_sum(
                    self.target_tp1 * tf.one_hot(act_tp1_idxs,
                                                 self.num_actions),
                    axis=1)
            else:
                q_tp1 = tf.reduce_max(self.target_tp1, axis=1)
            # bellman target; (1 - done) masks the bootstrap term for
            # terminal transitions
            y = self._L_r + (self.gamma * (1.0 - self._L_d) * q_tp1)
            # select q value of taken action
            qj = tf.reduce_sum(self.q_t * tf.one_hot(self._L_a,
                                                     self.num_actions),
                               axis=1)
            # TD errors (rebinds the placeholder created in __init__ to the
            # actual error tensor; fetched in learn() for prioritized replay)
            self._td_errors = qj - y
            # apply huber loss
            loss = tf.losses.huber_loss(y, qj)
            if self.use_tensorboard:
                scalar_summary('target', tf.reduce_mean(y))
                scalar_summary('huber-loss', tf.reduce_mean(loss))
                tf.summary.histogram('selected_Q', qj)
            # importance sampling weights
            if self.prioritized_replay:
                updates = tf.reduce_mean(self._is_weights * loss)
            else:
                updates = tf.reduce_mean(loss)
            return updates

    def _build_feed_dict(self,
                         obs_t,
                         ac_t,
                         rew_t,
                         obs_tp1,
                         dones,
                         eps,
                         rolling_rew,
                         weights=None):
        """
        Takes minibatch and returns feed dict for a tf.Session based on the
        algorithms configuration.
        """
        # first, add data required in all DQN configs
        feed_d = {
            self.q_t_in: obs_t,
            self.target_tp1_in: obs_tp1,
            self._L_r: rew_t,
            self._L_a: ac_t,
            self._L_d: dones
        }
        # pass obs t+1 to q network
        if self.double_q:
            feed_d[self.q_tp1_in] = obs_tp1
        # importance sampling weights
        if self.prioritized_replay:
            feed_d[self._is_weights] = weights
        # variables only necessary for TensorBoard visualisation
        if self.use_tensorboard:
            feed_d[self.eps_ph] = eps
            feed_d[self.rew_ph] = rolling_rew
        return feed_d

    def learn(self):
        """ Learns Q function for a given amount of timesteps """
        # reset env, store first observation
        obs_t = self.env.reset()
        # save all episode rewards
        episode_reward_series = [[0.0]]
        episode_rewards = []
        self.logger.info(
            'Starting Exploration, training will start at step {}.'.format(
                self.training_start))
        for t in tqdm(range(int(self.total_timesteps))):
            # decide on action either by policy or chose a random one
            epsilon = self.eps.value(t)
            _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon])
            if _rand:
                action = self.env.action_space.sample()
            else:
                action = np.argmax(self.sess.run(self.q_t,
                                                 {self.q_t_in: [obs_t]}),
                                   axis=1)
                assert len(action) == 1, 'only one action can be taken!'
                action = action[0]
            # act on environment with chosen action
            obs_tp1, reward, done, _ = self.env.step(action)
            # clip reward
            if self.reward_clipping:
                reward = 1 if reward > 0 else -1 if reward < 0 else 0
            # store new transition
            self.replay_buffer.add(obs_t, action, reward, obs_tp1,
                                   float(done))
            # new observation will be current one in next iteration
            obs_t = obs_tp1
            # append current rewards to episode reward series
            episode_reward_series[-1].append(reward)
            if self.render_training:
                self.env.render()
            if t == self.training_start:
                self.logger.info('Training starts now! (t = {})'.format(t))
            # final calculations and env reset
            if done:
                # calculate total reward
                episode_rewards.append(np.sum(episode_reward_series[-1]))
                episode_reward_series.append([0.0])
                # reset env to initial state
                obs_t = self.env.reset()
            # start training after warmup period
            if t >= self.training_start:
                # calculate rolling reward
                rolling_r = np.mean(episode_rewards[-self.rolling_reward_mean:]
                                    ) if len(episode_rewards) > 0 else 0.0
                # post episode stuff: printing and saving
                if done:
                    result_table = [['t', t],
                                    ['episode', len(episode_rewards)],
                                    ['mean_reward [20]', rolling_r],
                                    ['epsilon', epsilon]]
                    print('\n{}'.format(tabulate(result_table)))
                    # if the policy improved, save as new best ... achieving
                    # a good reward in one episode might not be the best
                    # metric. continuously achieving good rewards would better
                    if len(episode_rewards) >= 25:
                        mr = np.mean(
                            episode_rewards[-self.rolling_reward_mean:])
                        if mr >= self.latest_best:
                            self.latest_best = mr
                            self.logger.info(
                                'Saving new best policy with mean[{}]_r = {} ...'
                                .format(self.rolling_reward_mean, mr))
                            self._save('best')
                    # save latest policy
                    self._save()
                    # write current values to csv log
                    self.csvlog.write('{}, {}, {}\n'.format(
                        len(episode_rewards), epsilon, episode_rewards[-1]))
                # sample batch of transitions randomly for training and build
                # feed dictionary. prioritized replay needs a beta and
                # returns weights.
                if self.prioritized_replay:
                    o_t, a_t, r_t, o_tp1, do, is_ws, batch_idxs = self.replay_buffer.sample(
                        self.batch_size, self.pr_beta.value(t))
                    feed = self._build_feed_dict(o_t,
                                                 a_t,
                                                 r_t,
                                                 o_tp1,
                                                 do,
                                                 epsilon,
                                                 rolling_r,
                                                 weights=is_ws)
                else:
                    o_t, a_t, r_t, o_tp1, do = self.replay_buffer.sample(
                        self.batch_size)
                    feed = self._build_feed_dict(o_t, a_t, r_t, o_tp1, do,
                                                 epsilon, rolling_r)
                # run training (and summary) operations
                if self.use_tensorboard:
                    summary, _, td_errors = self.sess.run(
                        [self.merge_op, self.train_op, self._td_errors],
                        feed_dict=feed)
                    self.writer.add_summary(summary, t)
                else:
                    # FIX: also fetch the TD errors here; previously they
                    # were only fetched in the tensorboard branch, so
                    # prioritized replay without tensorboard raised a
                    # NameError on `td_errors` below.
                    _, td_errors = self.sess.run(
                        [self.train_op, self._td_errors], feed_dict=feed)
                # new td errors needed to update buffer weights
                if self.prioritized_replay:
                    new_prios = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxs,
                                                         new_prios)
                # sync target network every C steps
                if (t - self.training_start) % self.target_update_freq == 0:
                    self.sess.run(self.update_target_ops)
            if self.solved_callback is not None:
                if self.solved_callback(episode_rewards):
                    self.logger.info('Solved!')
                    break
        # total reward of last episode
        episode_rewards.append(np.sum(episode_reward_series[-1]))
        # finalize training, e.g. set flags, write done-file
        self._finalize_training()

    def run(self, render=True):
        """ Runs policy on given environment """
        if not self.is_trained:
            self.logger.warning('Trying to run untrained model!')
        # set necessary parameters to their defaults
        epsilon = self.final_eps
        reward = 0.0
        obs = self.env.reset()
        while True:
            # decide on action either by policy or chose a random one
            _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon])
            if _rand:
                action = self.env.action_space.sample()
            else:
                action = np.argmax(self.sess.run(self.q_t,
                                                 {self.q_t_in: [obs]}),
                                   axis=1)
                assert len(action) == 1, 'only one action can be taken!'
                action = action[0]
            # act on environment with chosen action
            obs, rew, done, _ = self.env.step(action)
            reward += rew
            if render:
                self.env.render()
            if done:
                self.logger.info('Done! Reward {}'.format(reward))
                reward = 0.0
                obs = self.env.reset()
def train_policy(arglist):
    """Train a DQN policy on MineRLNavigate(-Dense)-v0.

    Optionally seeds learning from human demonstrations: when
    arglist.use_demonstrations is set, half of the buffer capacity is a demo
    buffer and training before `learning_starts_at_steps` draws from it.
    Writes a CSV learning curve under ./learning_curves/ and periodic
    checkpoints under ./checkpoints/.
    """
    with U.single_threaded_session():
        # Create the environment
        if arglist.use_dense_rewards:
            print("Will use env MineRLNavigateDense-v0")
            env = gym.make("MineRLNavigateDense-v0")
            env_name = "MineRLNavigateDense-v0"
        else:
            print("Will use env MineRLNavigate-v0")
            env = gym.make('MineRLNavigate-v0')
            env_name = "MineRLNavigate-v0"
        if arglist.force_forward:
            env = MineCraftWrapperSimplified(env)
        else:
            env = MineCraftWrapper(env)
        if not arglist.use_demonstrations:
            # Use stack of last 4 frames as obs
            env = FrameStack(env, 4)

        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space,
                                                      name=name),
            q_func=build_q_func('conv_only', dueling=True),
            num_actions=env.action_space.n,
            gamma=0.9,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )

        # Create the replay buffer(s) (TODO: Use prioritized replay buffer)
        if arglist.use_demonstrations:
            # Split capacity evenly between online experience and demos.
            replay_buffer = ReplayBuffer(int(arglist.replay_buffer_len / 2))
            demo_buffer = load_demo_buffer(env_name,
                                           int(arglist.replay_buffer_len / 2))
        else:
            replay_buffer = ReplayBuffer(arglist.replay_buffer_len)

        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(
            schedule_timesteps=arglist.num_exploration_steps *
            arglist.num_episodes * arglist.max_episode_steps,
            initial_p=1.0,
            final_p=arglist.final_epsilon)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        episode_rewards = [0.0]
        n_episodes = 0
        n_steps = 0
        obs = env.reset()

        # Append-mode CSV learning-curve log, one file per run (timestamped).
        log_path = "./learning_curves/minerl_" + str(date.today()) + "_" + str(
            time.time()) + ".dat"
        log_file = open(log_path, "a")
        for episode in range(arglist.num_episodes):
            print("Episode: ", str(episode))
            done = False
            episode_steps = 0
            while not done:
                # Take action and update exploration to the newest value
                action = act(obs[None],
                             update_eps=exploration.value(n_steps))[0]
                new_obs, rew, done, _ = env.step(action)
                n_steps += 1
                episode_steps += 1

                # Break episode
                if episode_steps > arglist.max_episode_steps:
                    done = True

                # Store transition in the replay buffer.
                replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                # Store rewards
                episode_rewards[-1] += rew
                if done:
                    obs = env.reset()
                    episode_rewards.append(0)
                    n_episodes += 1

                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if (n_steps > arglist.learning_starts_at_steps) and (n_steps %
                                                                     4 == 0):
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        32)
                    train(obses_t, actions, rewards, obses_tp1, dones,
                          np.ones_like(rewards))

                if arglist.use_demonstrations:
                    # Before learning_starts: pre-train purely from demos.
                    if (n_steps < arglist.learning_starts_at_steps) and (
                            n_steps % 4 == 0):
                        obses_t, actions, rewards, obses_tp1, dones = demo_buffer.sample(
                            32)
                        train(obses_t, actions, rewards, obses_tp1, dones,
                              np.ones_like(rewards))
                    # After learning_starts: keep mixing in demo batches.
                    if (n_steps > arglist.learning_starts_at_steps) and (
                            n_steps % 4 == 0):
                        obses_t, actions, rewards, obses_tp1, dones = demo_buffer.sample(
                            32)
                        train(obses_t, actions, rewards, obses_tp1, dones,
                              np.ones_like(rewards))

                # Update target network periodically.
                if n_steps % arglist.target_net_update_freq == 0:
                    update_target()

                # Log data for analysis
                if done and len(episode_rewards) % 10 == 0:
                    logger.record_tabular("steps", n_steps)
                    logger.record_tabular("episodes", len(episode_rewards))
                    logger.record_tabular(
                        "mean episode reward",
                        round(np.mean(episode_rewards[-101:-1]), 1))
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * exploration.value(n_steps)))
                    logger.dump_tabular()

                #TODO: Save checkpoints
                if n_steps % arglist.checkpoint_rate == 0:
                    checkpoint_path = "./checkpoints/minerl_" + str(
                        episode) + "_" + str(date.today()) + "_" + str(
                            time.time()) + ".pkl"
                    save_variables(checkpoint_path)
                    # NOTE(review): placement of this CSV write (inside the
                    # checkpoint branch) reconstructed from statement order --
                    # confirm against the original file.
                    print("%s,%s,%s,%s" %
                          (n_steps, episode,
                           round(np.mean(episode_rewards[-101:-1]), 1),
                           int(100 * exploration.value(n_steps))),
                          file=log_file)
        log_file.close()
def main():
    """Train a 4-frame-stacked DQN on SuperMarioBros-1-1 with a target network.

    Trains every TRAIN_EPISODE steps, syncs the target net every
    TARGET_UPDATE_EPS steps, and checkpoints whenever the reward accumulated
    since the last training step exceeds the previous window's.
    """
    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000
    batch_size = 32
    n_size = 84          # side length of the preprocessed square frame
    discount = 0.99
    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight_2.ckpt'
    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    #replay_buffer = PrioritizedReplayBuffer(MAX_BUFFER_SIZE, alpha=prioritized_replay_alpha)
    replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE)

    sess = tf.Session()
    mainDQN = DQN(sess, name="main")
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list
    sess.run(tf.global_variables_initializer())
    # Ops that copy main-network weights into the target network.
    copy_ops = get_copy_var_ops(dest_scope_name="target",
                                src_scope_name="main")
    sess.run(copy_ops)
    saver = tf.train.Saver(var_list=dqn_var_list)

    for eps in range(MAX_EPISODES):
        # decaying epsilon greedy
        e = 1. / ((eps / 10) + 1)
        done = False
        step_count = 0
        state = env.reset()
        # Sliding windows of the last 4 frames for current / next observation.
        state_queue = deque(maxlen=4)
        next_state_queue = deque(maxlen=4)
        state_queue.append(state)
        next_state_queue.append(state)
        prev_100 = 0
        curr_100 = 0
        while not done:
            step_count += 1
            # cumulate 4 frames
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                next_state_queue.append(next_state)
                continue
            # training starts
            if np.random.rand() < e:
                action = env.action_space.sample()
            else:
                # Choose an action by greedily from the Q-network
                action = np.argmax(
                    mainDQN.predict(
                        np.reshape(np.array(state_queue),
                                   [1, n_size, n_size, 4])))
            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)
            if done:
                # Penalty
                reward = -100
            curr_100 += reward
            next_state_queue.append(next_state)
            replay_buffer.add(np.array(state_queue), action, reward,
                              np.array(next_state_queue), done)
            if step_count % TRAIN_EPISODE == 0:
                # NOTE(review): dones are discarded here and the bootstrap
                # target below is r + gamma * max Q' with NO (1 - done) mask,
                # so terminal transitions still bootstrap -- likely a bug.
                states, actions, rewards, next_states, _ = replay_buffer.sample(
                    batch_size)
                states, next_states = np.reshape(
                    states, [batch_size, n_size, n_size, 4]), np.reshape(
                        next_states, [batch_size, n_size, n_size, 4])
                Q_t = targetDQN.predict(next_states)
                Q_m = mainDQN.predict(states)
                Q_t = np.max(Q_t, axis=1)
                estimates = rewards + discount * Q_t
                Q_m[np.arange(batch_size), actions] = estimates
                loss = mainDQN.update(states, Q_m)
                print("eps: {} step: {} loss: {}".format(
                    eps, step_count, loss))
                # Checkpoint when the recent reward window improved.
                # NOTE(review): nesting of the two resets below reconstructed
                # from statement order -- confirm against the original file.
                if curr_100 > prev_100:
                    save_path = saver.save(
                        sess, os.path.join(checkpoint_dir, save_file_name))
                    print("Model saved in file: %s" % save_path)
                    prev_100 = curr_100
                    curr_100 = 0
            if step_count % TARGET_UPDATE_EPS == 0:
                sess.run(copy_ops)
            state_queue.append(next_state)
# Take action and store transition in the replay buffer. if num_iters <= args.learning_starts: action = random.randrange(num_actions) else: # Reshape state to (1, channels, x_dim, y_dim) action = agent.act(np.transpose(np.array(obs)[None], [0, 3, 1, 2]), eval=False) # import pdb # pdb.set_trace() new_obs, rew, done, info = env.step(action) death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0) prev_lives = info['ale.lives'] replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death)) obs = new_obs episode_rewards[-1] += rew if done: log.add_scalar('reward', episode_rewards[-1], num_iters) episode_rewards.append(0.0) obs = env.reset() num_episodes += 1 if num_iters > args.learning_starts and num_iters % args.learning_freq == 0: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( args.batch_size) # Reshape state to (batch, channels, x_dim, y_dim) obses_t = np.transpose(obses_t, [0, 3, 1, 2])
class MemBufferThread(threading.Thread):
    """Background thread that drains experience tuples from a shared queue
    into a replay buffer (uniform or prioritized).

    The producer side puts 5-tuples (obs, action, reward, next_obs, done)
    onto `mem_queue`; `run` moves them into the buffer forever.  `sample`
    and `update_priorities` expose the buffer to the training loop.
    """

    def __init__(self,
                 mem_queue,
                 max_timesteps=1000000,
                 buffer_size=50000,
                 batch_size=32,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6):
        threading.Thread.__init__(self)
        self.mem_queue = mem_queue
        self.prioritized_replay = prioritized_replay
        self.batch_size = batch_size
        # Indices of the most recently sampled prioritized batch (None for
        # uniform replay); consumed by update_priorities().
        self.batch_idxes = None
        self.prioritized_replay_eps = prioritized_replay_eps
        # Create the replay buffer.
        if not prioritized_replay:
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.beta_schedule = None
        else:
            self.replay_buffer = PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
            beta_iters = (max_timesteps
                          if prioritized_replay_beta_iters is None else
                          prioritized_replay_beta_iters)
            # Anneal the importance-sampling exponent beta up to 1.0.
            self.beta_schedule = LinearSchedule(
                beta_iters,
                initial_p=prioritized_replay_beta0,
                final_p=1.0)

    def __len__(self):
        return len(self.replay_buffer)

    def sample(self, t):
        """Draw a training batch at step `t`.

        Returns (obses_t, actions, rewards, obses_tp1, dones, weights);
        weights are all ones for uniform replay.
        """
        if not self.prioritized_replay:
            obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                self.batch_size)
            weights = np.ones_like(rewards)
            self.batch_idxes = None
        else:
            beta = self.beta_schedule.value(t)
            (obses_t, actions, rewards, obses_tp1, dones, weights,
             self.batch_idxes) = self.replay_buffer.sample(self.batch_size,
                                                           beta=beta)
        return obses_t, actions, rewards, obses_tp1, dones, weights

    def update_priorities(self, td_errors):
        """Refresh the priorities of the last sampled batch from TD errors."""
        prios = np.abs(td_errors) + self.prioritized_replay_eps
        self.replay_buffer.update_priorities(self.batch_idxes, prios)

    def run(self):
        """Forever move experience tuples from the queue into the buffer."""
        while True:
            if self.mem_queue.full() is True:
                print("the mem_queue is full")
            if self.mem_queue.empty() is not True:
                item = self.mem_queue.get()
                self.replay_buffer.add(item[0], item[1], item[2], item[3],
                                       item[4])
class DQNLearningAgent(Agent):
    """Step-wise DQN agent (baselines-style) with optional prioritized replay,
    a Monte-Carlo "smoothing" regression head, CSV training logs, and
    best-mean-reward checkpointing.

    Bug fix vs. original: `update()` sampled the prioritized buffer with
    `beta=self.beta_schedule.value(t)` where `t` was an undefined name (the
    step counter lives in `self.t`), so prioritized replay raised NameError
    on every training step.  It now uses `self.t`.
    """

    def __init__(
            self,
            env,
            # observation_space,
            # action_space,
            network=None,
            scope='deepq',
            seed=None,
            lr=None,  # Was 5e-4
            lr_mc=5e-4,
            total_episodes=None,
            total_timesteps=100000,
            buffer_size=50000,
            exploration_fraction=0.1,
            exploration_final_eps=None,  # was 0.02
            train_freq=1,
            train_log_freq=100,
            batch_size=32,
            print_freq=100,
            checkpoint_freq=10000,
            # checkpoint_path=None,
            learning_starts=1000,
            gamma=None,
            target_network_update_freq=500,
            prioritized_replay=False,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta0=0.4,
            prioritized_replay_beta_iters=None,
            prioritized_replay_eps=1e-6,
            save_path=None,
            load_path=None,
            save_reward_threshold=None,
            **network_kwargs):
        super().__init__(env, seed)
        # Logging/checkpoint cadences must nest so that every checkpoint is
        # also a log step and every log step is also a train step.
        if train_log_freq % train_freq != 0:
            raise ValueError(
                'Train log frequency should be a multiple of train frequency')
        elif checkpoint_freq % train_log_freq != 0:
            raise ValueError(
                'Checkpoint freq should be a multiple of train log frequency, or model saving will not be logged properly'
            )
        print('init dqnlearningagent')
        self.train_log_freq = train_log_freq
        self.scope = scope
        self.learning_starts = learning_starts
        self.save_reward_threshold = save_reward_threshold
        self.batch_size = batch_size
        self.train_freq = train_freq
        self.total_episodes = total_episodes
        self.total_timesteps = total_timesteps
        # TODO: scope not doing anything.
        # LunarLander-specific defaults when no network was specified.
        if network is None and 'lunar' in env.unwrapped.spec.id.lower():
            if lr is None:
                lr = 1e-3
            if exploration_final_eps is None:
                exploration_final_eps = 0.02
            #exploration_fraction = 0.1
            #exploration_final_eps = 0.02
            target_network_update_freq = 1500
            #print_freq = 100
            # num_cpu = 5
            if gamma is None:
                gamma = 0.99
            network = 'mlp'
            network_kwargs = {
                'num_layers': 2,
                'num_hidden': 64,
            }
        self.target_network_update_freq = target_network_update_freq
        self.gamma = gamma

        get_session()
        # set_global_seeds(seed)
        # TODO: Check whether below is ok to substitue for set_global_seeds.
        try:
            import tensorflow as tf
            tf.set_random_seed(seed)
        except ImportError:
            pass

        self.q_func = build_q_func(network, **network_kwargs)

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph
        def make_obs_ph(name):
            return ObservationInput(env.observation_space, name=name)

        act, self.train, self.train_mc, self.update_target, debug = deepq.build_train(
            make_obs_ph=make_obs_ph,
            q_func=self.q_func,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            optimizer_mc=tf.train.AdamOptimizer(learning_rate=lr_mc),
            gamma=gamma,
            grad_norm_clipping=10,
            param_noise=False,
            scope=scope,
            # reuse=reuse,
        )
        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': self.q_func,
            'num_actions': env.action_space.n,
        }
        self._act = ActWrapper(act, act_params)
        self.print_freq = print_freq
        self.checkpoint_freq = checkpoint_freq

        # Create the replay buffer
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                buffer_size,
                alpha=prioritized_replay_alpha,
            )
            if prioritized_replay_beta_iters is None:
                if total_episodes is not None:
                    raise NotImplementedError(
                        'Need to check how to set exploration based on episodes'
                    )
                prioritized_replay_beta_iters = total_timesteps
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=prioritized_replay_beta0,
                final_p=1.0,
            )
        else:
            self.replay_buffer = ReplayBuffer(buffer_size)
            # Separate buffer for Monte-Carlo smoothing targets (see smooth()).
            self.replay_buffer_mc = ReplayBuffer(buffer_size)
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(
                exploration_fraction *
                total_timesteps if total_episodes is None else total_episodes),
            initial_p=1.0,
            final_p=exploration_final_eps,
        )

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        # Per-episode bookkeeping; index -1 is always the running episode.
        self.episode_lengths = [0]
        self.episode_rewards = [0.0]
        self.discounted_episode_rewards = [0.0]
        self.start_values = [None]
        self.lunar_crashes = [0]
        self.lunar_goals = [0]
        self.saved_mean_reward = None

        self.td = None  # temp dir to clean up in close(), if we made one
        if save_path is None:
            self.td = tempfile.mkdtemp()
            outdir = self.td
            self.model_file = os.path.join(outdir, "model")
        else:
            outdir = os.path.dirname(save_path)
            os.makedirs(outdir, exist_ok=True)
            self.model_file = save_path
        print('DQN agent saving to:', self.model_file)
        self.model_saved = False
        if tf.train.latest_checkpoint(outdir) is not None:
            # TODO: Check scope addition
            load_variables(self.model_file, scope=self.scope)
            # load_variables(self.model_file)
            logger.log('Loaded model from {}'.format(self.model_file))
            self.model_saved = True
            raise Exception('Check that we want to load previous model')
        elif load_path is not None:
            # TODO: Check scope addition
            load_variables(load_path, scope=self.scope)
            # load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        # CSV training log (only for fresh runs that will be saved).
        self.train_log_file = None
        if save_path and load_path is None:
            self.train_log_file = self.model_file + '.log.csv'
            with open(self.train_log_file, 'w') as f:
                cols = [
                    'episode',
                    't',
                    'td_max',
                    'td_mean',
                    '100ep_r_mean',
                    '100ep_r_mean_discounted',
                    '100ep_v_mean',
                    '100ep_n_crashes_mean',
                    '100ep_n_goals_mean',
                    'saved_model',
                    'smoothing',
                ]
                f.write(','.join(cols) + '\n')

        self.training_episode = 0
        self.t = 0
        self.episode_t = 0
        """
        n = observation_space.n
        m = action_space.n
        self.Q = np.zeros((n, m))
        self._lr_schedule = lr_schedule
        self._eps_schedule = eps_schedule
        self._boltzmann_schedule = boltzmann_schedule
        """

        # Make placeholder for Q values
        self.q_values = debug['q_values']

    def _log_training_details(
            self,
            episode=None,
            t=None,
            td_max=None,
            td_mean=None,
            r_mean=None,
            r_mean_discounted=None,
            v_mean=None,
            n_crashes_mean=None,
            n_goals_mean=None,
            saved_model=False,
            smoothing=False,
    ):
        """Append one CSV row to the training log (no-op when not logging)."""
        if self.train_log_file is not None:
            with open(self.train_log_file, 'a+') as f:
                f.write('{}\n'.format(','.join([
                    str(episode),
                    str(t),
                    '{:.5f}'.format(td_max) if td_max is not None else '',
                    '{:.5f}'.format(td_mean) if td_mean is not None else '',
                    '{:.1f}'.format(r_mean) if r_mean is not None else '',
                    '{:.1f}'.format(r_mean_discounted)
                    if r_mean_discounted is not None else '',
                    '{:.1f}'.format(v_mean) if v_mean is not None else '',
                    '{:.1f}'.format(n_crashes_mean)
                    if n_crashes_mean is not None else '',
                    '{:.1f}'.format(n_goals_mean)
                    if n_goals_mean is not None else '',
                    str(int(saved_model)),
                    str(int(smoothing)),
                ])))

    def get_q_values(self, s):
        """Return the Q-value vector for a single observation `s`."""
        return self.q_values(s)[0]
        """
        q_t = self.q_func(
            self.obs_t_input.get(),
            self.n_actions,
            scope='q_func',
            reuse=True,  # reuse parameters from act
        )
        Q = sess.run(
            Q_values,
            feed_dict={Q_obs: np.array(states)}
        )
        raise NotImplementedError
        """

    def act(self, s, explore, explore_eps=None):
        """Choose an action for observation `s`.

        With explore=True and no explicit explore_eps, epsilon follows the
        annealing schedule (keyed by steps or episodes); explore=False is
        fully greedy.
        """
        # Take action and update exploration to the newest value
        # get_session()
        obs = s
        if explore and explore_eps is None:
            update_eps = self.exploration.value(
                self.t if self.total_episodes is None else self.
                training_episode)
        elif explore:
            update_eps = explore_eps
        else:
            update_eps = 0
        return self._act(
            np.array(obs)[None],
            update_eps=update_eps,
        )[0]

    def smooth(
            self,
            behavior_policy,
            evaluation_timesteps,
            max_k_random_actions=50,
    ):
        """Sample episodes to use for monte-carlo rollouts.

        Rolls out `behavior_policy` (random for the first k steps of each
        episode and at one randomized step), then regresses Q toward the
        observed discounted returns via train_mc, using replay_buffer_mc.
        """
        obs = self.env.reset()
        ep = 0
        episode_rewards = []
        episode_states = []
        episode_actions = []

        # TODO: Don't hard-code, and bias towards smaller.
        def get_random_k_t():
            # k_random: how many initial steps act randomly;
            # random_t: one later step that also acts randomly.
            k_random = self.np_random.randint(0, max_k_random_actions)
            random_t = self.np_random.randint(k_random, 200)
            return k_random, random_t

        k_random_actions, random_t = get_random_k_t()

        for t in range(evaluation_timesteps):
            episode_t = len(episode_actions)
            if IS_LOCAL and episode_t >= random_t:
                self.env.render()
            if episode_t < k_random_actions or episode_t == random_t:
                next_action = behavior_policy.act(
                    obs,
                    explore=True,
                    explore_eps=1,
                )
            else:
                next_action = behavior_policy.act(obs, explore=False)
            obs1, reward, done, _ = self.env.step(next_action)
            episode_rewards.append(reward)
            episode_states.append(obs)
            episode_actions.append(next_action)
            obs = obs1
            if done:
                # Turn the tail of the episode (from random_t on) into
                # Monte-Carlo (state, action, discounted-return) targets.
                for i, (o, a) in enumerate(
                        zip(episode_states[random_t:],
                            episode_actions[random_t:])):
                    weighted_rewards = [
                        r * self.gamma**j
                        for j, r in enumerate(episode_rewards[random_t + i:])
                    ]
                    reward_to_go = sum(weighted_rewards)
                    self.replay_buffer_mc.add(
                        o,
                        a,
                        reward_to_go,
                        None,
                        None,
                    )

                    # Update model.
                    obses_t, actions, rewards, _, _ = self.replay_buffer_mc.sample(
                        self.batch_size)
                    weights = np.ones_like(rewards)
                    td_errors = self.train_mc(obses_t, actions, rewards,
                                              weights)
                    # print(rewards)
                    # print(td_errors)
                    #print(self.get_q_values(o)[a], reward_to_go)
                    # print('----')
                    simulated_t = t - len(episode_rewards) + random_t + i
                    if simulated_t % self.train_log_freq == 0:
                        self._log_training_details(
                            episode=ep,
                            t=simulated_t,
                            td_max=np.max(np.abs(td_errors)),
                            td_mean=np.mean(np.abs(td_errors)),
                            smoothing=True,
                        )

                    # Save model
                    if (self.checkpoint_freq is not None
                            and simulated_t % self.checkpoint_freq == 0):
                        if self.print_freq is not None:
                            logger.log("Saving model due to smoothing")
                        # TODO: Check scope addition
                        save_variables(self.model_file, scope=self.scope)
                        # save_variables(self.model_file)
                        self.model_saved = True

                obs = self.env.reset()
                episode_rewards = []
                episode_states = []
                episode_actions = []
                ep += 1
                k_random_actions, random_t = get_random_k_t()
        """
        # Finish
        obs = obs1
        self.t += 1
        if done:
            self.episode_rewards.append(0.0)
            self.training_episode += 1
            obs = self.env.reset()
        """
        # TODO: Check that model isn't getting worse?
        # TODO: Reload last best saved model like in self.end_learning?

    @property
    def mean_100ep_reward(self):
        """Mean reward over the last 100 *completed* episodes."""
        return round(np.mean(self.episode_rewards[-101:-1]), 1)

    @property
    def mean_100ep_discounted_reward(self):
        return round(np.mean(self.discounted_episode_rewards[-101:-1]), 1)

    @property
    def mean_100ep_start_value(self):
        return round(np.mean(self.start_values[-100:]), 1)

    @property
    def mean_100ep_lunar_crashes(self):
        return round(np.mean(self.lunar_crashes[-100:]), 1)

    @property
    def mean_100ep_lunar_goals(self):
        return round(np.mean(self.lunar_goals[-100:]), 1)

    @property
    def mean_100ep_length(self):
        return round(np.mean(self.episode_lengths[-100:]), 1)

    def update(self, s, a, s1, r, done, verbose=False, freeze_buffer=False):
        """Record one transition and run the periodic DQN machinery.

        Stores (s, a, r, s1, done) unless freeze_buffer, updates episode
        statistics, and -- on the appropriate schedules once past
        learning_starts -- trains on a replay batch, syncs the target
        network, checkpoints on mean-reward improvement, and logs a CSV row.
        """
        # get_session()
        obs = s
        new_obs = s1
        action = a
        rew = r
        # Store transition in the replay buffer.
        if not freeze_buffer:
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        self.discounted_episode_rewards[-1] += rew * \
            self.gamma ** self.episode_t
        if self.start_values[-1] is None:
            # Value estimate at the episode's first state.
            self.start_values[-1] = max(self.get_q_values(s))
        # Lunar-lander specific reward sentinels: -100 crash, +100 goal.
        if rew == -100:
            self.lunar_crashes[-1] = 1
        elif rew == 100:
            self.lunar_goals[-1] = 1
        mean_100ep_reward = self.mean_100ep_reward

        td_errors = None
        if self.t > self.learning_starts and self.t % self.train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if self.prioritized_replay:
                # BUGFIX: was `self.beta_schedule.value(t)` -- `t` is not
                # defined in this method; the step counter is `self.t`.
                experience = self.replay_buffer.sample(
                    self.batch_size,
                    beta=self.beta_schedule.value(self.t),
                )
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                    self.batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors = self.train(obses_t, actions, rewards, obses_tp1,
                                   dones, weights)
            if self.prioritized_replay:
                new_priorities = np.abs(td_errors) + \
                    self.prioritized_replay_eps
                self.replay_buffer.update_priorities(batch_idxes,
                                                     new_priorities)

        if self.t > self.learning_starts and self.t % self.target_network_update_freq == 0:
            # Update target network periodically.
            self.update_target()

        saved = False
        if (self.checkpoint_freq is not None
                and self.t > self.learning_starts
                and self.training_episode > 100
                and self.t % self.checkpoint_freq == 0):
            if (self.saved_mean_reward is None
                    or mean_100ep_reward > self.saved_mean_reward
                    or (self.save_reward_threshold is not None
                        and mean_100ep_reward >= self.save_reward_threshold)):
                saved = True
                if self.print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase (or mean reward above {}): {} -> {}"
                        .format(
                            self.save_reward_threshold
                            if self.save_reward_threshold is not None else
                            'NULL', self.saved_mean_reward,
                            mean_100ep_reward))
                # TODO: Check scope addition
                save_variables(self.model_file, scope=self.scope)
                # save_variables(self.model_file)
                self.model_saved = True
                self.saved_mean_reward = mean_100ep_reward

        if self.t > self.learning_starts and self.t % self.train_log_freq == 0:
            self._log_training_details(
                episode=self.training_episode,
                t=self.t,
                td_max=np.max(np.abs(td_errors)),
                td_mean=np.mean(np.abs(td_errors)),
                r_mean=mean_100ep_reward,
                r_mean_discounted=self.mean_100ep_discounted_reward,
                v_mean=self.mean_100ep_start_value,
                n_crashes_mean=self.mean_100ep_lunar_crashes,
                n_goals_mean=self.mean_100ep_lunar_goals,
                saved_model=saved,
            )

        self.t += 1
        self.episode_t += 1
        if done:
            # Roll all per-episode trackers over to a fresh episode.
            self.start_values.append(None)
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0)
            self.lunar_crashes.append(0)
            self.lunar_goals.append(0)
            self.discounted_episode_rewards.append(0.0)
            self.training_episode += 1
            self.episode_t = 0

    def end_learning(self):
        """Restore the best checkpointed weights, if any were saved."""
        if self.model_saved:
            if self.print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    self.saved_mean_reward))
            # TODO: Check scope addition
            load_variables(self.model_file, scope=self.scope)
            # load_variables(self.model_file)

    def close(self):
        """Remove the temporary checkpoint directory, if one was created."""
        if self.td is not None:
            import shutil
            shutil.rmtree(self.td)
def evaluate(self, num_episodes, render=False): with U.make_session(NUM_CORES): self.t0 = time.time() env = self.env.env # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4) ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() self.episode_count += 1 state = env.reset() self.scores = [0.0] episode_q = [] for t in itertools.count(): action = act(state[None], update_eps=exploration.value(t))[0] observation, reward, done, _ = env.step(action) replay_buffer.add(state, action, reward, observation, float(done)) state = observation self.scores[-1] += reward episode_q.append(float(debug['q_values'](state[None]).max())) if render: env.render() if done: print('{0}, score: {1} ({2})'.format(len(self.scores), self.scores[-1], np.mean(self.scores[-100:]))) self.evaluation.info['q_values'].append(np.mean(episode_q)) if len(self.scores) >= num_episodes: return self.final_evaluation() state = env.reset() episode_q = [] self.scores.append(0) if self.env.solved(self.scores): self.evaluation.info['solved'] = len(self.scores) # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() U.reset() return self.final_evaluation()