import getopt
import queue
import random
import sys
import threading
from time import sleep, time

import gym
import numpy as np
import tensorflow as tf
from gym import wrappers

# Assumed to be defined elsewhere in the project: CustomGym,
# CustomGymClassicControl, Agent, Summary, async_trainer, evaluator,
# run_agent, training_finished, STATE_FRAMES, INITIAL_LEARNING_RATE, the
# stable-baselines DQN/MlpPolicy/Monitor pieces, and the maze globals
# (agentXY, goalXY, tasks, verbose, gamma, learning_rate, episodes,
# time_steps) used by run() below.

def get_random_state(game_name, random_steps=20):
    gym_env = gym.make(game_name)
    env = CustomGym(gym_env, game_name)
    # Reset the environment, then take random_steps random actions so the
    # returned state is not simply the initial frame.
    state = env.reset()
    for _ in range(random_steps):
        state, _, _, _ = env.step(random.randrange(env.action_size))
    return state
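# Hedged usage sketch: collect a small fixed set of "probe" states, e.g. to
# track how the agent's value estimates on held-out inputs evolve during
# training. That use is an assumption, not something the function requires.
probe_states = [get_random_state('SpaceInvaders-v0') for _ in range(32)]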
def main(argv):
    save_path = None
    T = None
    game_name = None
    try:
        opts, args = getopt.getopt(argv, "g:s:T:")
    except getopt.GetoptError:
        print("Usage: python run_agent.py -g <game name> -s <save path> -T <T>")
        # Exit here; otherwise the loop below would read an unbound `opts`.
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-g':
            game_name = arg
        elif opt == '-s':
            save_path = arg
        elif opt == '-T':
            T = arg
    if game_name is None:
        print("No game name specified")
        sys.exit()
    if save_path is None:
        print("No save path specified")
        sys.exit()
    if T is None:
        print("No T specified")
        sys.exit()
    print("Reading from", save_path)
    print("Running agent")
    env = CustomGym(game_name)
    run_agent(env, save_path, T, game_name)
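# Hedged usage sketch: wire main() to the command line, assuming this module
# is the run_agent.py script named in the usage string above.
if __name__ == '__main__':
    main(sys.argv[1:])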
def qlearn(game_name, nb_threads=8):
    processes = []
    envs = []
    for _ in range(nb_threads):
        gym_env = gym.make(game_name)
        env = CustomGym(gym_env)
        envs.append(env)

    # Shared global step counter, passed between worker threads via a queue.
    T_queue = queue.Queue()
    T_queue.put(0)

    with tf.Session() as sess:
        agent = Agent(session=sess, action_size=envs[0].action_size,
                      h=84, w=84, channels=STATE_FRAMES,
                      optimizer=tf.train.AdamOptimizer(INITIAL_LEARNING_RATE))
        sess.run(tf.global_variables_initializer())
        summary = Summary('tensorboard', agent)

        # One training thread per environment.
        for i in range(nb_threads):
            processes.append(threading.Thread(target=async_trainer,
                args=(agent, envs[i], sess, i, T_queue, summary,)))
        for p in processes:
            p.daemon = True
            p.start()

        while not training_finished:
            sleep(0.01)

        for p in processes:
            p.join()
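# Minimal sketch of the counter protocol implied by T_queue (async_trainer's
# real body is not shown here, so treat this as an assumption): each worker
# pops the global step count and pushes back the incremented value, so the
# queue's blocking get() doubles as a lock around a single integer.
def increment_T(T_queue):
    T = T_queue.get()
    T_queue.put(T + 1)
    return T + 1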
def a3c(game_name, num_threads=8, restore=None, save_path='model'):
    processes = []
    envs = []
    for _ in range(num_threads + 1):
        gym_env = gym.make(game_name)
        if game_name == 'CartPole-v0':
            env = CustomGymClassicControl(gym_env)
        else:
            print('Assuming Atari game and playing from pixels')
            env = CustomGym(gym_env, game_name, nb_frames=1)
        envs.append(env)

    # Separate out the evaluation environment
    evaluation_env = envs[0]
    envs = envs[1:]

    with tf.Session() as sess:
        agent = Agent(session=sess, action_size=envs[0].action_size,
                      model='mnih-lstm',
                      optimizer=tf.train.AdamOptimizer(INITIAL_LEARNING_RATE))

        # Create a saver, and only keep 2 checkpoints.
        saver = tf.train.Saver(max_to_keep=2)

        T_queue = queue.Queue()

        # Either restore the parameters or start from scratch.
        if restore is not None:
            saver.restore(sess, save_path + '-' + str(restore))
            last_T = restore
            print('T was:', last_T)
            T_queue.put(last_T)
        else:
            sess.run(tf.global_variables_initializer())
            T_queue.put(0)

        summary = Summary(save_path, agent)

        # Create a thread for each training worker
        for i in range(num_threads):
            processes.append(threading.Thread(target=async_trainer,
                args=(agent, envs[i], sess, i, T_queue, summary, saver,
                      save_path,)))

        # Create a thread to evaluate the agent
        processes.append(threading.Thread(target=evaluator,
            args=(agent, evaluation_env, sess, T_queue, summary, saver,
                  save_path,)))

        # Start all the threads as daemons so they die with the main thread
        for p in processes:
            p.daemon = True
            p.start()

        # Spin until training is finished
        while not training_finished:
            sleep(0.01)

        # Join the threads, so we get this thread back.
        for p in processes:
            p.join()
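# Hedged usage sketch: saver.restore() above expects checkpoints named
# '<save_path>-<T>', so resuming passes the global step of the checkpoint.
a3c('SpaceInvaders-v0', num_threads=8, save_path='model')  # fresh run
a3c('SpaceInvaders-v0', num_threads=8, restore=100000,
    save_path='model')                                     # resume at T=100000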
def run(double_dqn, task_num):
    title = "Double_Deep_Q" if double_dqn else "Deep_Q"
    env = Monitor(CustomGym(
        agentXY,
        goalXY,
        tasks[task_num][0],
        tasks[task_num][1],
        title=f"{title}_Task_{task_num + 1}",
    ), filename=None)
    model = DQN(MlpPolicy, env, verbose=verbose, gamma=gamma,
                learning_rate=learning_rate, double_q=bool(double_dqn))
    # Keep learning in chunks of time_steps until enough episodes are logged.
    while len(env.get_episode_rewards()) < episodes:
        model.learn(total_timesteps=time_steps)
    env.save_csv()
    env.destroy()
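# Hedged usage sketch: sweep both variants over every task in the
# (externally defined) tasks list, mirroring the indexing used above.
for task_num in range(len(tasks)):
    for double_dqn in (False, True):
        run(double_dqn, task_num)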
def play(agent, game_name, render=True, num_episodes=10, fps=5.0,
         monitor=True):
    gym_env = gym.make(game_name)
    if monitor:
        # Wrap the environment so episodes are recorded to disk.
        gym_env = wrappers.Monitor(gym_env, 'videos/-v0')
    # Play with the (possibly monitored) environment rather than a fresh
    # one, so the recordings actually capture the agent's episodes.
    env = CustomGym(gym_env, game_name)
    desired_frame_length = 1.0 / fps
    episode_rewards = []
    episode_vals = []
    t = 0
    for ep in range(num_episodes):
        print("Starting episode", ep)
        episode_reward = 0
        state = env.reset()
        terminal = False
        current_time = time()
        while not terminal:
            policy, value = agent.get_policy_and_value(state)
            action_idx = np.random.choice(agent.action_size, p=policy)
            state, reward, terminal, _ = env.step(action_idx)
            if render:
                env.render()
            t += 1
            episode_vals.append(value)
            episode_reward += reward
            # Sleep so the frame rate is correct
            next_time = time()
            frame_length = next_time - current_time
            if frame_length < desired_frame_length:
                sleep(desired_frame_length - frame_length)
            current_time = next_time
        episode_rewards.append(episode_reward)
    if monitor:
        gym_env.close()
    return episode_rewards, episode_vals
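# Hedged usage sketch: restore a trained agent the same way a3c() builds
# one, then watch it play. Assumes a checkpoint written by the saver above
# under the '<save_path>-<T>' naming pattern.
with tf.Session() as sess:
    env = CustomGym(gym.make('SpaceInvaders-v0'), 'SpaceInvaders-v0')
    agent = Agent(session=sess, action_size=env.action_size,
                  model='mnih-lstm',
                  optimizer=tf.train.AdamOptimizer(INITIAL_LEARNING_RATE))
    saver = tf.train.Saver()
    saver.restore(sess, 'model-100000')
    rewards, values = play(agent, 'SpaceInvaders-v0', num_episodes=5)
    print('Mean episode reward:', np.mean(rewards))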