# Assumed imports, following the OpenAI Retro Contest rainbow baseline layout
# (anyrl + retro-contest); the env helpers (make, AllowBacktracking,
# make_local_env, utils) are assumed to come from the contest's local modules.
import tensorflow as tf
import joblib
from anyrl.algos import DQN
from anyrl.envs import BatchedGymEnv
from anyrl.envs.wrappers import BatchedFrameStack
from anyrl.models import rainbow_models
from anyrl.rollouts import BatchedPlayer, NStepPlayer, PrioritizedReplayBuffer
from anyrl.spaces import gym_space_vectorizer


def main():
    """Run DQN until the environment throws an exception."""
    env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')
    env = AllowBacktracking(make_local_env(env, stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        # Make sure an exception arrives before we stop. The original referenced
        # an undefined `num_steps`; a large budget is assumed here.
        dqn.train(num_steps=2000000,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)
        print(tf.trainable_variables())
        save_path = '/home/noob/retro-noob/rainbow/params/params'
        utils.save_state(save_path + '_tf_saver')
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        ps = sess.run(params)
        joblib.dump(ps, save_path + '_joblib')
def save(self, path):
    """Save model to a pickle located at `path`."""
    with tempfile.TemporaryDirectory() as td:
        save_state(os.path.join(td, "model"))
        arc_name = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_name, 'w') as zipf:
            for root, dirs, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_name:
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_name, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        dill.dump(model_data, f)
def save(self, path=None):
    """Save model to a pickle located at `path`."""
    if path is None:
        path = os.path.join(logger.get_dir(), "model.pkl")

    with tempfile.TemporaryDirectory() as td:
        save_state(os.path.join(td, "model"))
        arc_name = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_name, 'w') as zipf:
            for root, dirs, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_name:
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_name, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        cloudpickle.dump((model_data, self._act_params), f)
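# For symmetry with the two save() variants above, a minimal load() sketch in the
# style of OpenAI baselines' ActWrapper.load. Assumptions: the pickle holds
# (model_data, act_params) as written by the cloudpickle variant, and
# deepq.build_act / load_state behave as in baselines.
def load(path):
    with open(path, "rb") as f:
        model_data, act_params = cloudpickle.load(f)
    act = deepq.build_act(**act_params)
    sess = tf.Session()
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)  # re-materialize the zip archive on disk
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        load_state(os.path.join(td, "model"))  # restore the TF checkpoint
    return ActWrapper(act, act_params)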
def maybe_save_model(savedir, container, state):
    """This function checkpoints the model and state of the training algorithm."""
    if savedir is None:
        return
    start_time = time.time()
    model_dir = "model-{}".format(state["num_iters"])
    save_state(os.path.join(savedir, model_dir, "saved"))
    if container is not None:
        container.put(os.path.join(savedir, model_dir), model_dir)
    relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True)
    if container is not None:
        container.put(os.path.join(savedir, 'training_state.pkl.zip'), 'training_state.pkl.zip')
    relatively_safe_pickle_dump(state["monitor_state"], os.path.join(savedir, 'monitor_state.pkl'))
    if container is not None:
        container.put(os.path.join(savedir, 'monitor_state.pkl'), 'monitor_state.pkl')
    logger.log("Saved model in {} seconds\n".format(time.time() - start_time))
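# The natural counterpart to maybe_save_model, sketched after the baselines Atari
# experiment code; assumes `pickle_load` is the reader paired with
# relatively_safe_pickle_dump and that `container.get` mirrors container.put.
def maybe_load_model(savedir, container):
    """Load model if present at the specified path."""
    if savedir is None:
        return
    state_path = os.path.join(savedir, 'training_state.pkl.zip')
    if container is not None:
        found_model = container.get(savedir, 'training_state.pkl.zip')
    else:
        found_model = os.path.exists(state_path)
    if found_model:
        state = pickle_load(state_path, compression=True)
        model_dir = "model-{}".format(state["num_iters"])
        if container is not None:
            container.get(savedir, model_dir)
        load_state(os.path.join(savedir, model_dir, "saved"))
        logger.log("Loaded model checkpoint at {} iterations".format(state["num_iters"]))
        return state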
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress;
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial
        value to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act
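# A minimal usage sketch for the learn() above (hypothetical setup: assumes gym
# and the baselines deepq model helpers are importable, as in the baselines examples).
import gym
from baselines import deepq

env = gym.make("CartPole-v0")
model = deepq.models.mlp([64])  # a small fully connected Q-network
act = learn(env, q_func=model, lr=1e-3, print_freq=10)
act.save("cartpole_model.pkl")  # ActWrapper.save, as defined earlier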
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress;
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial
        value to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() results_file = open('results.csv', 'w', newline='') results_writer = csv.writer(results_file, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) results_writer.writerow(['Episode', 'Reward']) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
                results_writer.writerow([num_episodes, mean_100ep_reward])

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    results_file.close()  # close the CSV log; the original left the handle open
    return act
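# Every variant in this file anneals exploration with LinearSchedule. For
# reference, a self-contained sketch matching the baselines semantics: the value
# interpolates from initial_p to final_p over schedule_timesteps, then stays at
# final_p.
class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)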
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    if env.is_single:
        observation_space_shape = env.observation_space.shape
        num_actions = env.action_space.n
    else:
        observation_space_shape = env.observation_space[0].shape
        num_actions = env.action_space[0].n
    num_agents = env.agentSize

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer (scaled by the number of agents)
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size * num_agents,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size * num_agents)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                       exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # Query the shared policy once per agent, collecting actions and Q-values.
            action = []
            qval = []
            for i in range(num_agents):
                prediction = act(np.array(obs[i])[None], update_eps=update_eps, **kwargs)
                action.append(prediction[0][0])
                qval.append(prediction[1][0])
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action, qval)
            # Store transition in the replay buffer.
            for i in range(num_agents):
                replay_buffer.add(obs[i], action[i], rew, new_obs[i], float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t * num_agents % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act, episode_rewards
def save_model(dict_state):
    save_state("saved_model/model.ckpt")
    relatively_safe_pickle_dump(dict_state, "saved_model/model_state.pkl.zip", compression=True)
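# Sketch of the matching restore step (assumption: `pickle_load` is the reader
# paired with relatively_safe_pickle_dump, as in baselines.common.misc_util).
def load_model():
    load_state("saved_model/model.ckpt")
    return pickle_load("saved_model/model_state.pkl.zip", compression=True)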
# Periodic evaluation fragment: the test run was evidently wrapped in a try so a
# failing evaluation cannot kill training (the opening `try:` was missing here).
try:
    reward_test, profit = run_test(env=env, act=act)
    print("Total profit test: > {}".format(round(profit, 2)))
    print("Avg profit per trade test > {}".format(round(reward_test, 3)))
    print("-------------------------------------")
except Exception as e:
    print("Exception: ", e)

obs = env.reset()
episode_rewards.append(0)
if is_solved:
    # Show off the result
    UT.save_state('./test_model/test_model')
    env.generate_summary_stats()
    run_test(env, act, final_test=True)
    break
else:
    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
    if t > 500:
        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
        train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
    # Update target network (and checkpoint) periodically.
    if t % 500 == 0:
        UT.save_state('./test_model/test_model')
        update_target()
def learn(env,
          q_func,
          policy_fn,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    # Create all the functions necessary to train the model
    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    scope = "ampi"
    reuse = None
    grad_norm_clipping = None
    num_actions = env.action_space.n
    optimizer_q = tf.train.AdamOptimizer(learning_rate=lr)
    optimizer_pi = tf.train.AdamOptimizer(learning_rate=lr)

    act = build_act(make_obs_ph, q_func, num_actions=env.action_space.n, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # policy network (note: this overwrites the act returned by build_act)
        ob_space = env.observation_space
        ac_space = env.action_space
        pi, act = policy_fn(obs_t_input.get(), ob_space, ac_space, scope="pi_func")  # train pi
        pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope=tf.get_variable_scope().name + "/pi_func")
        pi_tp1, act_tp1 = policy_fn(obs_tp1_input.get(), ob_space, ac_space,
                                    scope="target_pi_func")  # target pi
        target_pi_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_pi_func")  # fixed: original looked up "taget_pi_func"

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name + "/q_func")
        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=tf.get_variable_scope().name + "/target_q_func")

        # Q_{train}(s, a)
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # y_j
        act_best = tf.argmax(pi, axis=1)  # argmax \pi(s_{j+1})
        q_tp1_sampled = tf.reduce_sum(q_tp1 * tf.one_hot(act_best, num_actions),
                                      1)  # Q_{target}(s_{j+1}, argmax \pi(s_{j+1}))
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_sampled
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # Regression loss
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # argmax_a Q_{target}(s_j, a)
        z_j = tf.argmax(q_tp1, axis=1)  # max Q(s',a')

        # classification loss
        cl_error = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=z_j)

        # Q optimization
        if grad_norm_clipping is not None:
            gradients_q = optimizer_q.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients_q):  # fixed: original iterated over misspelled `gradients_qq`
                if grad is not None:
                    gradients_q[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_q = optimizer_q.apply_gradients(gradients_q)
        else:
            optimize_q = optimizer_q.minimize(weighted_error, var_list=q_func_vars)

        # pi optimization
        if grad_norm_clipping is not None:
            gradients_pi = optimizer_pi.compute_gradients(cl_error, var_list=pi_func_vars)
            for i, (grad, var) in enumerate(gradients_pi):
                if grad is not None:
                    gradients_pi[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_pi = optimizer_pi.apply_gradients(gradients_pi)
        else:
            optimize_pi = optimizer_pi.minimize(cl_error, var_list=pi_func_vars)

        # update_target Q
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # update_target pi
        update_target_pi = []
        for var, var_target in zip(sorted(pi_func_vars, key=lambda v: v.name),
                                   sorted(target_pi_func_vars, key=lambda v: v.name)):
            update_target_pi.append(var_target.assign(var))
        update_target_pi = tf.group(*update_target_pi)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, cl_error],
            updates=[optimize_q, optimize_pi]
        )
        update_target = U.function([], [], updates=[update_target_expr, update_target_pi])
        q_values = U.function([obs_t_input], q_t)
        debug = {'q_values': q_values}

    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                       exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action = env.action_space.sample()  # not used, just so we have the datatype
            stochastic = True
            ac1, vpred1 = act(stochastic, np.array(obs)[None])
            action = ac1[0]
            # action, _ = pi.act(stochastic, obs)
            # action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False

            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # Log training progress
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act
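# Several loops in this file inline the same parameter-noise expression. Factored
# out for clarity (the helper name is hypothetical): for eps-greedy with rate eps
# over num_actions actions, the KL divergence from the greedy policy is
# -log(1 - eps + eps/|A|), used as the distance target for parameter-space noise
# (Plappert et al., 2017, Appendix C.1).
def param_noise_kl_threshold(eps, num_actions):
    return -np.log(1. - eps + eps / float(num_actions))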
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None,
          demo_replay=[]):
    """Train a deepq model.

    Parameters
    ----------
    env: pysc2.env.SC2Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress;
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial
        value to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): # return U.BatchInput((64, 64), name=name) return QU.BatchInput((64, 64), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative obs, xy_per_marine = common.init(env, obs) group_id = 0 reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
                    update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                           exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # custom process for DefeatZerglingsAndBanelings
            obs, screen, player = common.select_marine(env, obs)

            action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            rew = 0

            new_action = None
            obs, new_action = common.marine_action(env, obs, player, action)
            army_count = env._obs[0].observation.player_common.army_count

            try:
                if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
                    obs = env.step(actions=new_action)
                else:
                    new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                    obs = env.step(actions=new_action)
            except Exception:
                pass  # Do nothing on a rejected action (the original swallowed the error with a bare `1`)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = player_relative

            rew += obs[0].reward
            done = obs[0].step_type == environment.StepType.LAST

            selected = obs[0].observation["screen"][_SELECTED]
            player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()

            if len(player_y) > 0:
                player = [int(player_x.mean()), int(player_y.mean())]

            if len(player) == 2:
                # Re-center the screen on the selected marine.
                if player[0] > 32:
                    new_screen = common.shift(LEFT, player[0] - 32, new_screen)
                elif player[0] < 32:
                    new_screen = common.shift(RIGHT, 32 - player[0], new_screen)
                if player[1] > 32:
                    new_screen = common.shift(UP, player[1] - 32, new_screen)
                elif player[1] < 32:
                    new_screen = common.shift(DOWN, 32 - player[1], new_screen)

            # Store transition in the replay buffer.
            replay_buffer.add(screen, action, rew, new_screen, float(done))
            screen = new_screen

            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                print("Episode Reward : %s" % episode_rewards[-1])
                obs = env.reset()
                player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
                screen = player_relative
                group_list = common.init(env, obs)
                # Select all marines first
                # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    # U.save_state(model_file)
                    QU.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            # U.load_state(model_file)
            QU.load_state(model_file)

    return ActWrapper(act)
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=100,
          print_freq=15,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None,
          demo_replay=[]):
    """Train a deepq model.

    Parameters
    ----------
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training, set this variable to None.
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial
        value to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    sess = TU.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    TU.initialize()
    update_target()

    group_id = 0
    old_num = 0
    reset = True
    Action_Choose = False
    player = []
    episode_rewards = [0.0]
    saved_mean_reward = None
    marine_record = {}

    obs = env.reset()
    screen = obs[0].observation["screen"][_UNIT_TYPE]
    obs, xy_per_marine = common.init(env, obs)

    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                           exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # custom process for DefeatZerglingsAndBanelings:
            # alternate between selecting a marine and issuing its order
            reset = False
            Action_Choose = not Action_Choose

            if Action_Choose:
                # the first half-step: select a marine
                obs, screen, group_id, player = common.select_marine(env, obs)
                marine_record = common.run_record(marine_record, obs)
            else:
                # the second half-step: choose a move for the selected marine
                action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
                action = common.check_action(obs, action)
                new_action = None
                obs, new_action, marine_record = common.marine_action(
                    env, obs, group_id, player, action, marine_record)
                army_count = env._obs[0].observation.player_common.army_count
                try:
                    if army_count > 0 and _MOVE_SCREEN in obs[0].observation["available_actions"]:
                        obs = env.step(actions=new_action)
                    else:
                        new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                        obs = env.step(actions=new_action)
                except Exception as e:
                    print(new_action)
                    print(e)
                    new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                    obs = env.step(actions=new_action)

            # get the new screen in action 2
            player_y, player_x = np.nonzero(obs[0].observation["screen"][_SELECTED] == 1)
            new_screen = obs[0].observation["screen"][_UNIT_TYPE]
            for i in range(len(player_y)):
                new_screen[player_y[i]][player_x[i]] = 49

            # update every step
            rew = obs[0].reward
            done = obs[0].step_type == environment.StepType.LAST
            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if not Action_Choose:
                # only store the screen after the action is done
                replay_buffer.add(screen, action, rew, new_screen, float(done))
                mirror_new_screen = common._map_mirror(new_screen)
                mirror_screen = common._map_mirror(screen)
                replay_buffer.add(mirror_screen, action, rew, mirror_new_screen, float(done))

            if done:
                obs = env.reset()
                Action_Choose = False
                group_list = common.init(env, obs)
                episode_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            num_episodes = len(episode_rewards)
            # progress trace
            if num_episodes > old_num:
                old_num = num_episodes
                print("now the episode is {}".format(num_episodes))

            if num_episodes > 102:
                mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            else:
                mean_100ep_reward = round(np.mean(episode_rewards), 1)

            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                print("get the log")
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act)
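# common._map_mirror is repo-specific; a plausible stand-in that mirrors the
# screen left-right with NumPy. Assumption: the real helper only flips the map,
# so this data augmentation is valid only when the stored action is symmetric
# under the flip (the loop above stores the same action for both orientations).
def _map_mirror(screen):
    return np.fliplr(np.asarray(screen)).copy()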
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

    Parameters
    ----------
    env: pysc2.env.SC2Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress;
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial
        value to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return BatchInput((16, 16), name=name) act_x, train_x, update_target_x, debug_x = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_x") act_y, train_y, update_target_y, debug_y = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_y") act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer_x = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) replay_buffer_y = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_x = ReplayBuffer(buffer_size) replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule_x = None beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target_x() update_target_y() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) #+ path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
                    update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                           exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action_x = act_x(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
            action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
            reset = False

            rew = 0
            coord = [action_x, action_y]

            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

            new_action = [sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])]
            # else:
            #     new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

            obs = env.step(actions=new_action)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

            player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
            player = [int(player_x.mean()), int(player_y.mean())]

            rew = obs[0].reward
            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffers.
            replay_buffer_x.add(screen, action_x, rew, new_screen, float(done))
            replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen
            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
                screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  # was `screent`, a typo
                player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                # Select all marines first
                env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
                episode_rewards.append(0.0)
                # episode_minerals.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience_x = replay_buffer_x.sample(batch_size, beta=beta_schedule_x.value(t))
                    (obses_t_x, actions_x, rewards_x, obses_tp1_x,
                     dones_x, weights_x, batch_idxes_x) = experience_x
                    experience_y = replay_buffer_y.sample(batch_size, beta=beta_schedule_y.value(t))
                    (obses_t_y, actions_y, rewards_y, obses_tp1_y,
                     dones_y, weights_y, batch_idxes_y) = experience_y
                else:
                    obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample(batch_size)
                    weights_x, batch_idxes_x = np.ones_like(rewards_x), None
                    obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size)
                    weights_y, batch_idxes_y = np.ones_like(rewards_y), None

                td_errors_x = train_x(obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x)
                td_errors_y = train_y(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y,
                                      weights_y)  # fixed: original called train_x, so the y-network never trained
                if prioritized_replay:
                    new_priorities_x = np.abs(td_errors_x) + prioritized_replay_eps
                    new_priorities_y = np.abs(td_errors_y) + prioritized_replay_eps
                    replay_buffer_x.update_priorities(batch_idxes_x, new_priorities_x)
                    replay_buffer_y.update_priorities(batch_idxes_y, new_priorities_y)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target networks periodically.
update_target_x() update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act_x), ActWrapper(act_y)
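# The learn() above factorizes the 16x16 screen-coordinate action space into
# two independent 16-way Q-heads (one per axis). A minimal driver sketch for
# it follows; the map name, model shape, and hyperparameters are illustrative
# assumptions in the style of pysc2-examples, not values fixed by this code,
# and the num_actions/num_cpu arguments follow the body's usage of those names.
from pysc2.env import sc2_env
from baselines import deepq

def main_mineral_shards():
    with sc2_env.SC2Env(map_name="CollectMineralShards",
                        step_mul=8, visualize=False) as env:
        # Dueling conv-net Q-function shared by both coordinate heads.
        model = deepq.models.cnn_to_mlp(
            convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
        act_x, act_y = learn(env,
                             q_func=model,
                             num_actions=16,          # one head per axis
                             lr=1e-4,
                             max_timesteps=2000000,
                             buffer_size=100000,
                             train_freq=4,
                             prioritized_replay=True,
                             num_cpu=4)
        # Each axis policy can be saved independently.
        act_x.save("mineral_shards_x.pkl")
        act_y.save("mineral_shards_y.pkl")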
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          test_agent=1e6,
          param_noise=False,
          double=True,
          epsilon=True,
          eps_val=0.01,
          alpha_val=0.01,
          q1=False,
          n_steps=1,
          sample=False,
          piecewise_schedule=False,
          alpha_epsilon=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    epsilon: bool
        if True, runs alpha-DQN
    q1: bool
        if True, runs the surrogate version; otherwise runs the expected version.
    callback: (locals, globals) -> None
        function called at every step with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for
        details on the act function.
    """
    # Create all the functions necessary to train the model
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.__enter__()

    # Capture the shape outside the closure so that the env object is not
    # serialized by cloudpickle when serializing make_obs_ph.
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    def make_fixed_obs_ph(name):
        return FixedBatchInput(observation_space_shape,
                               batch=batch_size * n_steps,
                               name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        make_fixed_obs_ph=make_fixed_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise,
        double_q=double,
        epsilon=epsilon,
        eps_val=alpha_val,
        q1=q1,
        n_steps=n_steps,
        batch_size=batch_size,
        sample=sample,
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha,
                                                n_steps=n_steps,
                                                gamma=gamma)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size, n_steps, gamma)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    if piecewise_schedule:
        exploration = PiecewiseSchedule(
            endpoints=[(0, 1.0), (1e6, exploration_final_eps), (24e6, 0.01)],
            outside_value=0.01)
    else:
        exploration = LinearSchedule(
            schedule_timesteps=int(exploration_fraction * max_timesteps),
            initial_p=1.0,
            final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    mean_reward = None
    obs = env.reset()
    reset = True
    epinfobuf = deque(maxlen=100)
    test_flag = False

    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between
                # perturbed and non-perturbed policy is comparable to
                # eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration,
                # Plappert et al., 2017 for a detailed explanation.
                update_param_noise_threshold = -np.log(
                    1. - exploration.value(t) +
                    exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            if epsilon:
                env_action, action_wanted, random_action_flag = act(
                    np.array(obs)[None], update_eps=update_eps, **kwargs)
                env_action = env_action[0]
                if q1:
                    action = action_wanted[0]
                else:
                    action = env_action
            else:
                action = act(np.array(obs)[None],
                             update_eps=update_eps, **kwargs)[0]
                env_action = action
                # The plain act() does not report whether the action was random.
                random_action_flag = False
            reset = False

            new_obs, rew, done, info = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, env_action, rew, new_obs,
                              float(done), float(random_action_flag),
                              update_eps)
            obs = new_obs
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfobuf.extend([maybeepinfo])
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from the replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, env_actions, rewards, obses_tp1,
                     dones, weights, batch_idxes) = experience
                    # The prioritized buffer's sample() does not return
                    # per-transition random-action flags, so treat the
                    # sampled actions as non-random for the alpha correction.
                    random_action_flags = np.zeros_like(rewards)
                else:
                    (obses_t, actions, env_actions, rewards, obses_tp1, dones,
                     random_action_flags, eps) = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(actions), None
                if alpha_epsilon:
                    td_errors = train(obses_t, actions, env_actions, rewards,
                                      obses_tp1, dones, weights,
                                      random_action_flags, update_eps)
                else:
                    td_errors = train(obses_t, actions, env_actions, rewards,
                                      obses_tp1, dones, weights,
                                      random_action_flags, alpha_val)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            if t > learning_starts and t % test_agent == 0:
                test_flag = True

            if done and test_flag:
                nEpisodes = 50
                test_rewards = deque(maxlen=nEpisodes)
                for i in range(nEpisodes):
                    obs, done = env.reset(), False
                    reward = 0
                    maybeepinfo = None
                    while maybeepinfo is None:
                        curr_update_eps = 0.001
                        if env.unwrapped.ale.getEpisodeFrameNumber() > 108000:
                            # Terminate over-long episodes by acting randomly.
                            curr_update_eps = 0.99999
                        obs, rew, done, info = env.step(
                            act(obs[None], stochastic=True,
                                update_eps=curr_update_eps)[0])
                        maybeepinfo = info.get('episode')
                        if maybeepinfo:
                            reward = maybeepinfo['r']
                    test_rewards.extend([reward])
                logger.record_tabular("test_reward_mean", np.mean(test_rewards))
                logger.record_tabular("steps", t)
                logger.dump_tabular()
                obs = env.reset()
                test_flag = False

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            # Keep mean_reward up to date so the checkpoint comparison below
            # is always defined, even when nothing was printed this step.
            mean_reward = safemean([epinfo['r'] for epinfo in epinfobuf])
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("episode_reward_mean", mean_reward)
                logger.record_tabular(
                    "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()
            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if (saved_mean_reward is None or mean_reward > saved_mean_reward
                        or (mean_reward >= saved_mean_reward and mean_reward > 0)):
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_reward
                    act.save()
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_state(model_file)
    return act
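# The buffers above are constructed with n_steps and gamma, which suggests they
# aggregate n-step returns internally before handing batches to train(). A
# self-contained sketch of that aggregation, under the assumption that the
# buffer sums discounted rewards over up to n transitions and truncates at
# episode boundaries (an illustration, not the buffer's confirmed code):
import numpy as np

def n_step_return(rewards, dones, gamma, n_steps):
    """Illustrative n-step return: sum discounted rewards over up to
    n_steps transitions, stopping early at a terminal flag."""
    ret, discount = 0.0, 1.0
    for k in range(min(n_steps, len(rewards))):
        ret += discount * rewards[k]
        if dones[k]:
            break
        discount *= gamma
    return ret

# Example: with gamma=0.99, a 3-step return over rewards [1, 0, 2] and no
# terminals gives 1 + 0.99*0 + 0.99**2 * 2 = 2.9602.
assert abs(n_step_return([1, 0, 2], [False, False, False], 0.99, 3) - 2.9602) < 1e-9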
    summary_writer = tf.summary.FileWriter(
        stats_folder + "summaries\\" + method + "\\" + time_string + "\\")
    set_global_seeds(seed)

    # 'None' here is the literal string default passed through from the CLI.
    if model_file == 'None':
        model_file = (os.path.abspath(os.path.dirname(__file__)) + "\\" +
                      stats_folder + "models\\" + method + "\\" + time_string +
                      "\\" + os.path.basename(env_path).split('.')[0])

    if enjoy_file == 'None':
        # Train a new model
        act = None
        if method == 'dqn':
            print("Training using DQN...")
            act = learn_dqn(env_path=env_path,
                            seed=seed,
                            max_steps=max_steps,
                            reward_range=reward_range,
                            base_port=base_port,
                            unity_arguments=unity_arguments,
                            summary_writer=summary_writer,
                            model_file=model_file)
        elif method == 'a2c':
            print("Training using A2C...")
            act = learn_a2c(env_path=env_path,
                            seed=seed,
                            max_steps=max_steps,
                            reward_range=reward_range,
                            base_port=base_port,
                            unity_arguments=unity_arguments,
                            summary_writer=summary_writer)
        else:
            print("Unknown method: \"" + method + "\".")

        # Only save when training actually produced a model.
        if act is not None:
            print("Saving model to " + model_file + ".")
            save_state(model_file)
    else:
        # Load and enjoy an existing model
        if method == 'dqn':
            print("Enjoying using DQN...")
            enjoy_dqn(env_path=env_path,
                      seed=seed,
                      max_steps=max_steps,
                      base_port=base_port,
                      unity_arguments=unity_arguments,
                      model_file=enjoy_file)
        elif method == 'a2c':
            print("Loading A2C models not supported yet...")
        else:
            print("Unknown method: \"" + method + "\".")
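# The hard-coded "\\" separators above tie the script to Windows. A portable
# variant of the same path construction using os.path.join; the
# stats_folder / summaries / models / method / time_string layout simply
# mirrors the strings above, and the time format is an assumption.
import os
import time

time_string = time.strftime("%Y%m%d-%H%M%S")
summary_dir = os.path.join(stats_folder, "summaries", method, time_string)
model_dir = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    stats_folder, "models", method, time_string,
    os.path.basename(env_path).split('.')[0])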