def test_single_training():
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    env = Environment(numberOfCells, deterministic=True)
    agent = DQNAgent(state_size=env.state_size,
                     action_size=Actions.action_size,
                     deterministic=True,
                     batch_size=24,
                     memory_limit=2000)
    state = env.reset(startingPosition, foodPosition)
    agent.reset_convolutional_layers()
    full_state = agent.get_convolutional_layers(state)
    loss10 = -1
    action10 = -1
    maxsteps = 10
    for step in range(maxsteps):
        action = agent.get_exploration_action()
        next_state, reward, done = env.step(action, food_position=(1, 1))
        assert(not done)
        full_next_state = agent.get_convolutional_layers(next_state)
        assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))
        agent.save_transition(full_state, action, reward, full_next_state, done)
        current_loss = agent.train()
        full_state = full_next_state
        loss10 = current_loss
        action10 = action
    assert(loss10 == 0.006804642267525196)
    assert(action10 == 0)
def test_smoke():
    # just runs the code - no assertions
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    env = Environment(numberOfCells)
    agent = DQNAgent(state_size=env.state_size,
                     action_size=Actions.action_size,
                     deterministic=True,
                     batch_size=24,
                     memory_limit=2000)
    state = env.reset(startingPosition, foodPosition)
    agent.reset_convolutional_layers()
    full_state = agent.get_convolutional_layers(state)
    maxsteps = 2
    for step in range(maxsteps):
        action = agent.get_exploration_action()
        next_state, reward, done = env.step(action, food_position=(1, 1))
        full_next_state = agent.get_convolutional_layers(next_state)
        assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))
        agent.save_transition(full_state, action, reward, full_next_state, done)
        current_loss = agent.train()
        if step == 0:
            action1 = action
            loss1 = current_loss
        full_state = full_next_state
    loss2 = current_loss
    action2 = action
def __init__(self, env, do_render, num_threads, gamma, lr, global_max_episode):
    state_size, action_size = env.observation_space.shape[0], env.action_space.n

    self.qnetwork_global = QNetwork(state_size, action_size)  # .to(device)
    self.qnetwork_global.share_memory()
    self.qnetwork_target = QNetwork(state_size, action_size)  # .to(device)
    self.qnetwork_target.share_memory()

    self.agents = [
        DQNAgent(id=id,
                 env=env,
                 do_render=do_render,
                 state_size=state_size,
                 action_size=action_size,
                 n_episodes=global_max_episode,
                 lr=lr,
                 gamma=gamma,
                 update_every=UPDATE_EVERY + num_threads,
                 global_network=self.qnetwork_global,
                 target_network=self.qnetwork_target)
        for id in range(num_threads)
    ]
def main():
    config = Config()
    env = Environment(config)
    agent = DQNAgent(config)
    trainer = Trainer(config, env, agent)
    trainer.train()
    trainer.play()
def test_multiepisode_training():
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    env = Environment(numberOfCells, deterministic=True)
    state_size = env.state_size
    action_size = Actions.action_size  # 3
    agent = DQNAgent(state_size=state_size,
                     action_size=action_size,
                     deterministic=True,
                     batch_size=24,
                     memory_limit=2000)
    losses = [-1, -1, -1, -1]
    done = False
    episodes = 4
    maxsteps = 9
    for e in range(episodes):
        state = env.reset(startingPosition, foodPosition)
        agent.reset_convolutional_layers()
        full_state = agent.get_convolutional_layers(state)
        loss = 0
        for step in range(maxsteps):
            action = agent.get_exploration_action()
            # food generation on (1, 1) happens once over the test
            next_state, reward, done = env.step(action, food_position=(1, 1))
            full_next_state = agent.get_convolutional_layers(next_state)
            assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))
            agent.save_transition(full_state, action, reward, full_next_state, done)
            current_loss = agent.train()
            loss += current_loss
            full_state = full_next_state
        losses[e] = loss
    assert(losses[0] == 3.9618697417899966)
    assert(losses[1] == 0.044194952584803104)
    assert(losses[2] == 0.1333141174982302)
    assert(losses[3] == 2.834151452407241)
def main(): """Main""" env_id = 'SpaceInvaders-v0' weight_fname = '/home/matthieu/temp/test.h5' env = ProcessedEnvironnement( env_id, outdir='/home/matthieu/temp/random-agent-results', wrappers_cond=True) env.seed(0) network = ConvNet(input_shape=(84, 84, 1), nbr_action=env.action_space.n, weight_fname=weight_fname) agent = DQNAgent(action_space=env.action_space, network=network, obs_shape=(84, 84, 1), buffer_size=6, decay=0.0, epsilon=0.9) episode_count = 1 reward = 0 action_repetition_rate = 4 action = 0 for i in range(episode_count): ob = env.reset() done = True counter = 0 while True: if counter % action_repetition_rate == 0: action = agent.act(ob, reward, done) print(action) ob, reward, done, _ = env.step(action) counter += 1 if done: break # Close the env and write monitor result info to disk env.close()
def train(network=None, expert_data_path=None):
    env = make_env()
    env_spec = acme.make_environment_spec(env)
    if network is None:
        network = make_dqn(env_spec.actions.num_values)

    expert_data = None
    if expert_data_path is not None:
        with open(expert_data_path, "rb") as handle:
            expert_data = pickle.load(handle)
        num_timesteps = np.sum([1 + len(ep["mid"]) for ep in expert_data])
        print(f"Using expert data from {expert_data_path}. "
              f"Episodes: {len(expert_data)}. Timesteps: {num_timesteps}.")

    agent = DQNAgent(environment_spec=env_spec,
                     network=network,
                     batch_size=32,
                     learning_rate=1e-4,
                     logger=loggers.NoOpLogger(),
                     min_replay_size=1000,
                     max_replay_size=int(1e5),
                     target_update_period=2500,
                     epsilon=tf.Variable(0.025),
                     n_step=20,
                     discount=0.97,
                     expert_data=expert_data)

    loop = EnvironmentLoop(environment=env, actor=agent, module2save=network)
    reward_history = loop.run(num_steps=int(1e6),
                              render=True,
                              checkpoint=True,
                              checkpoint_freq=15)

    avg_hist = [np.mean(reward_history[i:(i + 50)])
                for i in range(len(reward_history) - 50)]
    plt.plot(list(range(len(avg_hist))), avg_hist)
    plt.show()

    env.close()
    return network
def main():
    random.seed(SEED)

    # Create agent-directory
    execution_time = str(round(time.time()))
    agent_dir = os.path.join("agents", ALGORITHM, ENVIRONMENT + "_" + execution_time)
    os.makedirs(agent_dir)

    # Initialize utils, environment and agent
    utils = Utils(agent_dir, FRAMES_PER_EPOCH, EPOCHS * FRAMES_PER_EPOCH)
    env = gym.make(ENVIRONMENT)

    try:
        env.env.frameskip = FRAMESKIP
        env.env.ale.setFloat("repeat_action_probability", REPEAT_ACTION_PROB)

        if ALGORITHM == 'MFEC':
            if AGENT_PATH:
                agent = MFECAgent.load(AGENT_PATH)
            else:
                agent = MFECAgent(
                    ACTION_BUFFER_SIZE,
                    K,
                    DISCOUNT,
                    EPSILON,
                    SCALE_HEIGHT,
                    SCALE_WIDTH,
                    STATE_DIMENSION,
                    range(env.action_space.n),
                    SEED,
                )
        else:
            agent = DQNAgent(env.action_space.n)
            if AGENT_PATH:
                agent.load(AGENT_PATH)
        run_algorithm(agent, agent_dir, env, utils)
    finally:
        utils.close()
        env.close()
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
env = EpisodicLifeEnv(env)
env = FireResetEnv(env)
env = WarpFrame(env)
env = PyTorchFrame(env)
env = ClipRewardEnv(env)
env = FrameStack(env, 4)

replay_buffer = ReplayBuffer(args.replay_buffer_size)
agent = DQNAgent(
    env.observation_space,
    env.action_space,
    replay_buffer,
    use_double_dqn=args.use_double_dqn,
    lr=args.lr,
    batch_size=args.batch_size,
    gamma=args.gamma
)

eps_timesteps = args.eps_fraction * float(args.num_steps)
episode_rewards = [0.0]
loss = [0.0]

state = env.reset()
for t in range(args.num_steps):
    fraction = min(1.0, float(t) / eps_timesteps)
    eps_threshold = args.eps_start + fraction * (args.eps_end - args.eps_start)
    sample = random.random()
    if sample > eps_threshold:
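        # (Sketch) The excerpt above stops at the epsilon-greedy branch. A minimal
        # continuation in the style of the fuller training scripts in this
        # collection; the args.learning_starts / args.learning_freq /
        # args.target_update_freq names below are assumed, not taken from the
        # original file.
        action = agent.act(np.array(state))
    else:
        action = env.action_space.sample()

    next_state, reward, done, _ = env.step(action)
    agent.memory.add(state, action, reward, next_state, float(done))
    state = next_state
    episode_rewards[-1] += reward

    if done:
        state = env.reset()
        episode_rewards.append(0.0)

    if t > args.learning_starts and t % args.learning_freq == 0:
        agent.optimise_td_loss()

    if t > args.learning_starts and t % args.target_update_freq == 0:
        agent.update_target_network()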
def main():
    config = {'starting-floor': 0,
              'total-floors': 9,
              'dense-reward': 1,
              'lighting-type': 0,
              'visual-theme': 0,
              'default-theme': 0,
              'agent-perspective': 1,
              'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1,
                           retro=True, realtime_mode=False, config=config)
    print(env.observation_space)
    print(env.action_space)

    hyper_params = {
        "seed": 6,  # which seed to use
        "replay-buffer-size": int(5e3),  # replay buffer size
        "learning-rate": 1e-4,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": int(1e6),  # total number of steps to run the environment for
        "batch-size": 32,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 1,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 1000,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.05,  # fraction of num-steps
        "print-freq": 10
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    # assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip"
    # env = gym.make(hyper_params["env"])
    env.seed(hyper_params["seed"])

    # env = NoopResetEnv(env, noop_max=30)
    # env = MaxAndSkipEnv(env, skip=4)
    # env = EpisodicLifeEnv(env)
    # env = FireResetEnv(env)
    # env = WarpFrame(env)
    env = PyTorchFrame(env)
    # env = ClipRewardEnv(env)
    # env = FrameStack(env, 4)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=hyper_params["use-double-dqn"],
        lr=hyper_params["learning-rate"],
        batch_size=hyper_params["batch-size"],
        gamma=hyper_params["discount-factor"]
    )

    model_num = 500
    agent.policy_network.load_state_dict(
        torch.load('./Models/' + str(model_num) + '_policy.pt',
                   map_location=torch.device(device)))

    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]
    ep_nums = model_num

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (
            hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()

        # Epsilon-greedy step: take a random action if sample <= eps_threshold,
        # otherwise act greedily; then step the env, add (state, action, reward,
        # next_state, float(done)) to the replay memory and accumulate the
        # episode reward.
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)
            ep_nums += 1
            if ep_nums % 50 == 0:
                agent.save_models(ep_nums)
                plot(episode_rewards, ep_nums)

        if t > hyper_params["learning-starts"] and t % hyper_params["learning-freq"] == 0:
            agent.optimise_td_loss()

        if t > hyper_params["learning-starts"] and t % hyper_params["target-update-freq"] == 0:
            agent.update_target_network()

        num_episodes = len(episode_rewards)

        if done and hyper_params["print-freq"] is not None and \
                len(episode_rewards) % hyper_params["print-freq"] == 0:
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
print("********************************************************") print("steps: {}".format(t)) print("episodes: {}".format(num_episodes)) print("mean 100 episode reward: {}".format(mean_100ep_reward)) print("% time spent exploring: {}".format(int(100 * eps_threshold))) print("********************************************************") #if done and ep_nums % 10 == 0: # animate(env,agent,"anim/progress_"+str(ep_nums)) # state = env.reset() animate(env,agent,"anim/final") env.close()
# We can use proportional or rank-based prioritized replay
# (proportional seems to be preferred by many papers)
# Simple, non-prioritized replay is also implemented
alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=50000,
                                            alpha_scheduler=alpha_scheduler,
                                            beta_scheduler=beta_scheduler)
##memory = dqn.experience_replay.RankBased(capacity=50000, alpha_scheduler=alpha_scheduler, beta_scheduler=beta_scheduler)
##memory = dqn.experience_replay.Simple(capacity=50000)

# Below we add n-step learning with the parameter n_step
# Not yet supported: frame skipping will be added in the future
agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=3,
                 update_target_network_frequency=2000)

agent.train(env, num_timesteps=num_steps, render=False)

# We can save and load an agent
# Note: Currently this only saves the weights of the network -- the entire agent
# must be recreated (or reused, as would happen here) before calling load
##agent.save('/tmp/save_test/test')
##agent.load('/tmp/save_test/test')
def train_snake():
    # todo: move all these parameters into a configuration file
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    max_steps_allowed = 1000
    env = Environment(numberOfCells)
    state_size = env.state_size  # (numberOfCells x numberOfCells)
    action_size = Actions.action_size  # 3
    agent = DQNAgent(state_size=state_size,
                     action_size=action_size,
                     batch_size=32,
                     memory_limit=6000,
                     number_of_channels=5)
    episodes = 30000
    decay = 0.9 / episodes * 2  # changes epsilon : explore vs exploit
    epochs = []
    losses = []
    steps_list = []

    with open('training_data', 'w') as f:
        for e in range(episodes):
            state = env.reset(startingPosition)
            #print('state array reset: \n', state)
            agent.reset_convolutional_layers()
            full_state = agent.get_convolutional_layers(state)
            loss = 0.0
            steps = 0
            done = False
            episode_reward = 0
            while not done:
                # state at this point is just a 2D array
                action = agent.get_action(full_state)
                #action = agent.get_raction()
                #print('action chosen: ', action)

                # step onto the next state
                next_state, reward, done = env.step(action)
                #print('state array after step ', steps, ' : \n', next_state)
                #print('reward returned: ', reward)
                #print('next state: ', next_state)

                # we store the next_state in (1,H,W,C)
                full_next_state = agent.get_convolutional_layers(next_state)
                #print('full next state: \n:', full_next_state)
                #assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))

                # save S,A,R,S' to experience
                # full states are a snapshot - copies of the state
                agent.save_transition(full_state, action, reward, full_next_state, done)
                episode_reward += reward

                # use alternative policy to train model - rely on experience only
                current_loss = agent.train()
                #print('current_loss: ', current_loss)
                loss += current_loss
                full_state = full_next_state

                # limit max steps - avoid something bad
                steps += 1
                if steps >= max_steps_allowed:
                    done = True

            # next episode
            if agent.epsilon > 0.1:
                agent.epsilon -= decay  # agent slowly reduces exploring

            print(
                'episode: {:5d} steps: {:3d} epsilon: {:.3f} loss: {:8.4f} reward: {:3d} fruits: {:2d}'
                .format(e, steps, agent.epsilon, loss, episode_reward, env.fruits_eaten))
            f.write('{:5d} {:3d} {:8.4f} {:4d} {:2d}\n'.format(
                e, steps, loss, episode_reward, env.fruits_eaten))

    agent.model.save('trained_snake.model')
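# The todo at the top of train_snake() asks for these hyperparameters to live in
# a configuration file. A minimal sketch of one way to group them, assuming a
# plain dataclass (the class and field names are illustrative, not part of the
# original project):
from dataclasses import dataclass


@dataclass
class SnakeTrainingConfig:
    number_of_cells: int = 10          # board size in each axis
    starting_position: tuple = (4, 5)  # head
    max_steps_allowed: int = 1000
    batch_size: int = 32
    memory_limit: int = 6000
    number_of_channels: int = 5
    episodes: int = 30000
    min_epsilon: float = 0.1


# Usage sketch:
#   cfg = SnakeTrainingConfig()
#   env = Environment(cfg.number_of_cells)
#   agent = DQNAgent(state_size=env.state_size, action_size=Actions.action_size,
#                    batch_size=cfg.batch_size, memory_limit=cfg.memory_limit,
#                    number_of_channels=cfg.number_of_channels)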
    action='store',
    help="Please specify the agent you wish to use, either DQN or A3C",
    required=True)
parser.add_argument(
    "-n",
    "--mode",
    type=str,
    action='store',
    help="Please specify the mode you wish to run, either train or eval",
    required=True)
args = parser.parse_args()
print(args)

if args.model == 'DQN':
    agent = DQNAgent()
    if args.mode == 'train':
        agent.train()
    if args.mode == 'eval':
        agent.Evaluate()

if args.model == 'A3C':
    agent = A3CGlobalAgent()
    if args.mode == 'train':
        agent.train()
    if args.mode == 'eval':
        agent.Evaluate()
import dqn.experience_replay
import tensorflow.contrib.layers as layers

env = gym.make('CartPole-v1')
num_steps = 200000

# Here we combine the same improvements from Rainbow, but use QR instead of C51
# Note that we are still using a DistributionalQNetwork, but this network uses n
# as the number of quantiles rather than the number of atoms
# TODO: Do we want to allow noisy_net=False ? Does this make sense or not ?
q_func = nn.DistributionalQNetwork([64], env.action_space.n, n=75,
                                   noisy_net=True, dueling=[32])

epsilon_scheduler = dqn.annealing_schedules.Constant(0)
action_selection = dqn.algorithms.EpsilonGreedy(epsilon_scheduler)

loss = dqn.algorithms.QuantileRegressionLoss()
update_target = dqn.algorithms.HardUpdate()

alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=100000,
                                            alpha_scheduler=alpha_scheduler,
                                            beta_scheduler=beta_scheduler)

agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=3,
                 update_target_network_frequency=100)

agent.load('save/qr_dqn')
agent.run(env, num_timesteps=num_steps, render=True)
                       realtime_mode=False, config=config)
env.seed(random_seed)

# Run with specific wrappers
#
# This is the only wrapper we used, as the others didn't add enough value
env = PyTorchFrame(env)
# env = FrameStack(env, 3)
# env = HumanActionEnv(env)

# Create agent to train
replay_buffer = ReplayBuffer(int(5e3))
agent = DQNAgent(
    env.observation_space,
    env.action_space,
    replay_buffer,
    use_double_dqn=True,
    lr=args.lr,
    batch_size=hyper_params["batch-size"],
    gamma=hyper_params["discount-factor"],
)

# If we have pretrained weights, load them
if args.checkpoint:
    print(f"Loading a policy - {args.checkpoint}")
    agent.policy_network.load_state_dict(torch.load(args.checkpoint))

eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
episode_rewards = [0.0]
step_count = 0

state = env.reset()
for t in range(hyper_params["num-steps"]):
import gym
import numpy as np

from dqn.agent import DQNAgent
from dqn.agent import EPISODES, EPISODE_LENGTH, BATCH_SIZE

environment_name = 'CartPole-v1'
environment = gym.make(environment_name)
environment.max_episode_steps = EPISODE_LENGTH

n_actions = environment.action_space.n
n_state_features = environment.observation_space.shape[0]

# Initialize DQN agent
agent = DQNAgent(n_state_features, n_actions)

for episode in range(EPISODES):

    state = environment.reset()
    state = np.reshape(state, [1, n_state_features])

    for t in range(EPISODE_LENGTH):

        # Predict next action using NN Value Function Approximation
        action = agent.get_action(state)

        # Interact with the environment and observe new state and reward
        next_state, reward, terminated, info = environment.step(action)

        # Huge negative reward if failed
        if terminated:
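            # (Sketch) The excerpt stops here; a typical continuation of this
            # pattern is shown below. remember() and replay() are hypothetical
            # method names used only for illustration -- check dqn.agent for
            # the real API.
            reward = -100  # penalize the terminal transition

        next_state = np.reshape(next_state, [1, n_state_features])

        # Store the transition and move on (hypothetical remember() API)
        agent.remember(state, action, reward, next_state, terminated)
        state = next_state

        if terminated:
            break

    # Learn from a sampled minibatch once per episode (hypothetical replay() API)
    agent.replay(BATCH_SIZE)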
env = PyTorchFrame(env)
env = ClipRewardEnv(env)
env = FrameStack(env, 4)
env = gym.wrappers.Monitor(
    env, './video/',
    video_callable=lambda episode_id: episode_id % 50 == 0,
    force=True)

replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

agent = DQNAgent(
    env.observation_space,
    env.action_space,
    replay_buffer,
    use_double_dqn=hyper_params["use-double-dqn"],
    lr=hyper_params['learning-rate'],
    batch_size=hyper_params['batch-size'],
    gamma=hyper_params['discount-factor'],
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    dqn_type=hyper_params["dqn_type"])

if args.load_checkpoint_file:
    print(f"Loading a policy - {args.load_checkpoint_file}")
    agent.policy_network.load_state_dict(
        torch.load(args.load_checkpoint_file))

eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
episode_rewards = [0.0]

state = env.reset()
# helper method for reshaping the cartpole observation
def reshape(state):
    return np.reshape(state, [1, 4])


if __name__ == '__main__':
    tf.compat.v1.disable_eager_execution()

    max_score = 0
    n_episodes = 5000
    max_env_steps = 1000

    env = gym.make('CartPole-v0')
    agent = DQNAgent(env=env,
                     net=NN(alpha=0.001, decay=0.0001),
                     memory=ReplayMemory(size=100000))

    if max_env_steps is not None:
        env._max_episode_steps = max_env_steps

    for e in range(n_episodes):
        # reset the env
        state = reshape(env.reset())
        done = False
        score = 0

        # play until env done
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # env.render()
                           type=int)
args, unknowns = cmdline_parser.parse_known_args()

history_length = args.history_length
num_actions = args.num_actions

Q = CNN(state_dim, num_actions, history_length, hidden=256, lr=1e-3)
Q_target = CNNTargetNetwork(state_dim, num_actions, history_length, hidden=256, lr=1e-3)
agent = DQNAgent(Q, Q_target, num_actions, discount_factor=0.99,
                 batch_size=64, epsilon=0.05)
agent.load("./models_carracing/dqn_agent.ckpt")

n_test_episodes = 15

episode_rewards = []
for i in range(n_test_episodes):
    stats = run_episode(env, agent, deterministic=True,
                        do_training=False, rendering=True)
    episode_rewards.append(stats.episode_reward)
                                                  end=0.02,
                                                  num_steps=31 / 32 * num_steps)
action_selection = dqn.algorithms.GaussianRandomProcess(stddev_scheduler)

loss = dqn.algorithms.NAFLoss()  # TODO: ADD IN ALL OPTIONS HERE AND IN OTHER ONES
update_target = dqn.algorithms.SoftUpdate(tau=0.001)

alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=1000000,
                                            alpha_scheduler=alpha_scheduler,
                                            beta_scheduler=beta_scheduler)

agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=1,
                 batch_size=100,
                 discount_factor=0.99,
                 replay_period=1,
                 replays_per_step=5,
                 update_with_replay=True,
                 update_target_network_frequency=1)

agent.train(env, num_timesteps=num_steps, render=False)
##agent.save('/tmp/save_data_new/naf')
env.seed(hyper_params['seed'])

# env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
# env = EpisodicLifeEnv(env)
# env = FireResetEnv(env)
env = WarpFrame(env)
env = PyTorchFrame(env)
env = ClipRewardEnv(env)
env = FrameStack(env, 3)

replay_buffer = ReplayBuffer(hyper_params['replay_buffer_size'])
agent = DQNAgent(env.observation_space,
                 env.action_space,
                 replay_buffer,
                 use_double_dqn=hyper_params['use_double_dqn'],
                 lr=hyper_params['learning_rate'],
                 batch_size=hyper_params['batch_size'],
                 gamma=hyper_params['discount_factor'])

eps_timesteps = hyper_params['eps_fraction'] * float(hyper_params['num_steps'])
episode_rewards = [0.0]
loss = [0.0]
policy_actions = unpickle_object('action_map')

state = env.reset()
for t in range(hyper_params['num_steps']):
    fraction = min(1.0, float(t) / eps_timesteps)
    eps_threshold = hyper_params['eps_start'] + fraction * (
        hyper_params['eps_end'] - hyper_params['eps_start'])
import gym
import numpy as np

from dqn.agent import DQNAgent
from dqn.agent import EPISODES, EPISODE_LENGTH

environment_name = 'CartPole-v1'
environment = gym.make(environment_name)
environment.max_episode_steps = EPISODE_LENGTH

n_actions = environment.action_space.n
n_state_features = environment.observation_space.shape[0]

# Initialize DQN agent
agent = DQNAgent(n_state_features, n_actions, epsilon=0.0)

# Load pre-trained agent
agent.load(f'./models/{environment_name}.h5')

for episode in range(EPISODES):

    state = environment.reset()
    state = np.reshape(state, [1, n_state_features])

    for t in range(EPISODE_LENGTH):

        # Visualize environment
        environment.render()

        # Predict next action using NN Value Function Approximation
        action = agent.get_action(state)