def main():
    env = gym.make("LunarLander-v2")
    timestamp = '{:%Y-%m-%d-%H:%M}'.format(datetime.datetime.now())
    o_dir = "LunarLander-v2/{}/models".format(timestamp)
    if not os.path.exists(o_dir):
        os.makedirs(o_dir)

    nof_episodes = 500
    # 8 values in [0, 1]
    state_size = env.observation_space.shape[0]
    # 0, 1, 2, 3
    action_size = env.action_space.n
    agent = DeepQAgent(state_size, action_size, model=2)
    batch_size = 32

    for episode in range(nof_episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        done = False
        t = 0
        episode_reward = 0

        # Iterate over the timesteps
        while not done:
            env.render()
            # Instruct the agent to choose an action based on the current state of the environment.
            # This may be a random action depending on the value of the exploration_rate (epsilon).
            action = agent.act(state)
            # Execute said action
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            next_state = np.reshape(next_state, [1, state_size])
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, time: {}, total_reward: {}".format(
                    episode, nof_episodes - 1, t, episode_reward))
            t += 1
            if len(agent.memory) / batch_size > 1:
                agent.train(batch_size)

        # Save model after training
        if episode % batch_size == 1:
            agent.save(o_dir + "/model_" + str(episode) + ".hdf5")
def main(_):
    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
        raise Exception("use_gpu flag is true when no GPUs are available")

    assert FLAGS.checkpoint_dir != '', 'Checkpoint directory must be specified'
    if not FLAGS.to_train and not os.path.isfile(
            os.path.join(FLAGS.checkpoint_dir, 'ckpt.index')):
        raise Exception(
            "Checkpoint directory must contain a trained model to do testing")

    # Cap the fraction of GPU memory this process may claim and allow it to grow as needed.
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction),
        allow_growth=True)
    sess_config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=FLAGS.allow_soft_placement,
        gpu_options=gpu_options)

    with tf.Session(config=sess_config) as sess:
        config = get_config(FLAGS)
        env = AtariEnvironment(config)
        agent = DeepQAgent(env, sess, config)
        # Train from scratch or play back from the checkpoint, depending on the config.
        if config.to_train:
            agent.train()
        else:
            agent.play()
import numpy as np
import time

import actor
from env.puzzle import PAD
from agent import DeepQAgent

shape = [5, 6]
moves = 100

board = PAD(shape=shape, max_moves=moves, show=False)
print('board set up')

print('setting up agent')
agent = DeepQAgent(board, n_moves=moves, batch_size=64, memory=128,
                   sample_mode='e_greedy', reward_type='combo')
print('agent set up')
print('Max moves: ', agent.n_moves)

agent.observe()
actor.train_loop(agent)

## Replace board so we can watch some play
board = PAD(shape=shape, max_moves=moves, show=True, sleep_time=0.05)
agent.swap_board(board)
actor.run_loop(agent)
from agent import TabularQAgent, DeepQAgent
import numpy as np
import gym
import matplotlib.pyplot as plt
from utils import plot_learning_curve

env = gym.make('CartPole-v1')
n_actions = env.action_space.n
n_states = env.observation_space.shape

A = DeepQAgent(lr=0.001, gamma=0.9, eps_max=1.0, eps_min=0.01,
               eps_dec=0.9999995, n_actions=n_actions, n_states=n_states,
               input_dims=n_states)

n_episodes = 10000
win_pct_list = []
scores = []
eps_history = []

for i in range(n_episodes):
    done = False
    score = 0
    s = env.reset()
    while not done:
        # The rest of this loop was missing from the snippet; the lines below
        # are a reconstruction, and the agent method names (choose_action,
        # store_transition, learn) are assumptions about DeepQAgent's API.
        a = A.choose_action(s)
        s_, reward, done, info = env.step(a)
        A.store_transition(s, a, reward, s_, done)
        A.learn()
        score += reward
        s = s_
    scores.append(score)
    eps_history.append(A.epsilon)
def main(): print("Start Atari games") environment_name = "PongNoFrameskip-v4" env = make_env(environment_name) best_score = -np.inf load_checkpoint = False n_games = 500 lr = 0.0001 epsilon = 1 gamma = 0.99 input_dims = env.observation_space.shape n_actions = env.action_space.n eps_min = 0.01 eps_dec = 5e-7 replace = 1000 algo = None mem_size = 50000 batch_size = 32 chkpt_dir = "models/" algo = "DeepQAgent" agent = DeepQAgent(lr, n_actions, input_dims, chkpt_dir, epsilon, gamma, mem_size, batch_size, eps_min, eps_dec, replace, algo, environment_name) if load_checkpoint: agent.load_models() fname = agent.algo + "_" + agent.env_name + '_lr' + str( agent.lr) + "_" + str(n_games) + "_games" figure_file = "plots/" + fname + ".png" n_steps = 0 scores, eps_history, steps_array = [], [], [] for i in range(n_games): done = False score = 0 observation = env.reset() while not done: action = agent.get_action(observation) new_observation, reward, done, info = env.step(action) score += reward if not load_checkpoint: agent.store_transition(observation, action, reward, new_observation, int(done)) agent.learn() observation = new_observation n_steps += 1 scores.append(score) steps_array.append(n_steps) avg_score = np.mean(scores[-100:]) print( "episode ", i + 1, "score: ", score, "average score %.1f best score %.1f epsilon %.2f" % (avg_score, best_score, agent.epsilon), " steps ", n_steps) if avg_score > best_score: if not load_checkpoint: agent.save_models() best_score = avg_score eps_history.append(agent.epsilon) plot_learning_curve(steps_array, scores, eps_history, figure_file) print("End Atari games")
from agent import DDPG, DeepQAgent
from environment import Environment

done_comparison_data = {
    'coords_done_fail': [45, 60, 118, 180],
    'coords_done_success': [5, 16, 122, 174],
    'img_done_fail': 'data/s8_cut_try_again.png',
    'img_done_success': 'data/game_score_s8.png',
    'restart_btn_coords': [640, 1110],
    'restart_ongame': [(2764, 93), (2624, 552)],
}

scores = {
    'coords_diamonds_gathered': [11, 27, 25, 35],
    'digits_mask_addr': 'data/digits',
    'match_threshold': 10,
    'state_area': [28, 112, 0, 296],
    'time_importance': 0.7,
    'diamonds_importance': 0.3,
    'episode_time_limit': 60,
    'diamonds_total': 7
}

env = Environment(device_ref_elements_data={
    'done_comparison_data': done_comparison_data,
    'scores': scores
})

# agent = DDPG(env)
agent = DeepQAgent(env)

train(agent, env, episode_seconds_constrain=45)