def __init__(self):
    sess = tf.Session()
    with sess:
        global_step = tf.Variable(0, dtype=tf.int32,
                                  name='global_episodes', trainable=False)
        self.env = gym_env = gym.make(FLAGS.game)
        if FLAGS.gym_seed and FLAGS.gym_seed != -1:
            gym_env.seed(FLAGS.gym_seed)
        if FLAGS.monitor:
            gym_env = gym.wrappers.Monitor(gym_env, FLAGS.experiments_dir)
        env = AtariEnvironment(
            gym_env=gym_env,
            resized_width=FLAGS.resized_width,
            resized_height=FLAGS.resized_height,
            agent_history_length=FLAGS.agent_history_length)
        nb_actions = len(env.gym_actions)

        self.agent = DQNAgent(env, sess, nb_actions, global_step)
        self.saver = tf.train.Saver(max_to_keep=1000)

        if FLAGS.resume or not FLAGS.train:
            checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.algorithm)
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())
def __init__(self, learner_type):
    self.agent = DQNAgent(100, 2, 0.3, 0.4)
    self.env = None

    # set the model for the learning engine depending on
    # learning type
    if learner_type == 'the_rival':
        self.agent.init_model('qnet')
    else:
        # placeholder: replace with a more suitable model for mime if one exists
        self.agent.init_model('qnet')
def train(agent: DQNAgent, env: Env, episodes: int = 10_000):
    display = False
    progression = tqdm.trange(episodes, desc=f"Training {agent.name}",
                              unit="episode")
    fps = 0
    for episode in progression:
        state = env.reset()
        mean_reward = 0
        return_ = 0
        x_pos = 0
        for step in count(1):
            t = time()
            action = agent.act(np.asarray(state), explore=True)
            next_state, reward, done, info = env.step(action)
            agent.memorize(
                Experience((state, next_state, action, done, reward)))
            state = next_state
            agent.learn()
            mean_reward += (reward - mean_reward) / step
            return_ += reward
            x_pos = max(x_pos, info["x_pos"])
            fps = fps * 0.9 + 0.1 / (time() - t)
            if not step % 100:
                # Re-read the display toggle every 100 steps so rendering can
                # be switched on or off while training runs.
                try:
                    display = (yaml.safe_load(
                        (PROJECT_DIRECTORY / "display.yml").read_text()).get(
                            agent.name, {}).get("display", False))
                except Exception:
                    pass
            if display:
                env.render()
            if done or info["flag_get"]:
                break
        progression.set_description(
            f"Training {agent.name}; "
            f"Frames: {agent.step} ({fps:.0f} FPS); "
            f"last progression: {x_pos} ({x_pos/3260:.1%}); "
            f"eps: {agent.eps:.2f}")
        agent.register_episode(
            EpisodeMetrics(episode=episode, x_pos=x_pos, return_=return_,
                           steps=step))
        agent.save_model()
def create_agent(conf, action_space, observation_space):
    if conf['agent'] == "dqn":
        return DQNAgent(
            action_space, observation_space,
            batch_size=conf['batch_size'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "conv_dqn":
        return ConvDQNAgent(
            action_space, observation_space,
            batch_size=conf['batch_size'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "tabular_q":
        return TabularQAgent(
            action_space, observation_space,
            q_init=conf['q_value_init'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "random":
        return RandomAgent(action_space, observation_space)
    else:
        raise ValueError("Agent type [%s] is not supported." % conf['agent'])
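# A possible conf dict for create_agent, sketched from the keys the factory
# reads above; the concrete values are illustrative assumptions, not taken
# from the source.
#
# conf = {
#     'agent': 'dqn',           # or 'conv_dqn', 'tabular_q', 'random'
#     'batch_size': 32,
#     'learning_rate': 1e-3,
#     'discount': 0.99,
#     'random_explore': 0.1,
#     'q_value_init': 0.0,      # only read for 'tabular_q'
# }
# agent = create_agent(conf, action_space, observation_space)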
def __init__(self, action_space, cmdl, is_training=True):
    DQNAgent.__init__(self, action_space, cmdl, is_training)
    self.name = "Categorical_agent"
    self.cmdl = cmdl
    hist_len, action_no = cmdl.hist_len, self.action_no

    self.policy = policy = get_model(cmdl.estimator, 1, hist_len,
                                     (action_no, cmdl.atoms_no),
                                     hidden_size=cmdl.hidden_size)
    self.target = target = get_model(cmdl.estimator, 1, hist_len,
                                     (action_no, cmdl.atoms_no),
                                     hidden_size=cmdl.hidden_size)
    if self.cmdl.cuda:
        self.policy.cuda()
        self.target.cuda()

    self.policy_evaluation = CategoricalPolicyEvaluation(policy, cmdl)
    self.policy_improvement = CategoricalPolicyImprovement(
        policy, target, cmdl)
def main():
    env = UnityEnvironment(
        file_name="/home/faten/projects/deep-reinforcement-learning/p1_navigation/Banana_Linux/Banana.x86_64"
    )
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    state_size = len(state)

    agent = DQNAgent(state_size, action_size, seed=0)
    scores = train(env, agent)

    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

    # watch the trained agent
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
    for i in range(3):
        state = env.reset()
        for j in range(200):
            action = agent.act(state)
            env.render()
            state, reward, done, _ = env.step(action)
            if done:
                break

    env.close()
def create(config):
    working_agent = config['GLOBAL']['working_agent']
    if working_agent is None:
        return None
    if working_agent == 'DQNAgent':
        # Maybe better to use type() to get the name, but this is fine for now.
        # Dynamic import; see Item 52, "Know How to Break Circular Dependencies",
        # in "Effective Python".
        from agents.dqn_agent import DQNAgent
        return DQNAgent(config)
    if working_agent == 'REINFORCEAgent':
        from agents.reinforce_agent import REINFORCEAgent
        return REINFORCEAgent(config)
    if working_agent == 'ActorCriticAgent':
        from agents.actor_critic import ActorCriticAgent
        return ActorCriticAgent(config)
    return None
def get_agent(state_shape: tuple, action_size: int, model: torch.nn.Module,
              policy, memory, optimizer, params):
    # Agent
    agent = DQNAgent(
        state_shape=state_shape,
        action_size=action_size,
        model=model,
        policy=policy,
        batch_size=params['BATCH_SIZE'],
        update_frequency=params['UPDATE_FREQUENCY'],
        gamma=params['GAMMA'],
        lr_scheduler=DummyLRScheduler(optimizer),  # Using Adam
        optimizer=optimizer,
        memory=memory,
        seed=params['SEED'],
        tau=params['TAU'],
        action_repeats=params['ACTION_REPEATS'],
    )
    return agent
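# A minimal usage sketch for get_agent. The hyperparameter values below are
# illustrative assumptions (not taken from the source), and `model`, `policy`,
# `memory`, and `optimizer` are assumed to be constructed elsewhere.
#
# example_params = {
#     'BATCH_SIZE': 64,
#     'UPDATE_FREQUENCY': 4,
#     'GAMMA': 0.99,
#     'SEED': 0,
#     'TAU': 1e-3,
#     'ACTION_REPEATS': 1,
# }
# agent = get_agent(state_shape=(8,), action_size=4, model=model, policy=policy,
#                   memory=memory, optimizer=optimizer, params=example_params)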
def navigation_main():
    env = UnityEnvironment(file_name="Navigation/Banana.app")

    seed = 777
    np.random.seed(seed)
    seed_torch(seed)

    num_episode = 2000
    memory_size = 10000
    batch_size = 64
    target_update = 4
    epsilon_decay = 0.9

    agent = DQNAgent(env, memory_size, batch_size, target_update, epsilon_decay)
    agent.train(num_episode)
    agent.test()
env = UnityEnvironment(file_name="./Banana_Linux/Banana.x86_64")
min_solved = 13.0

# Get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
action_size = brain.vector_action_space_size
state_size = len(env_info.vector_observations[0])

scores = []
test_scores = []
test_scores_i = []
avg_scores = []
scores_window = deque(maxlen=100)

config = generate_configuration_qnet(action_size, state_size)
agent = DQNAgent(config)
agent.create_dirs()
eps = config.eps_start

for i_episode in range(1, config.n_episodes + 1):
    # Reset the environment and the score
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    score = 0
    while True:
        action = agent.act(state, eps)
        env_info = env.step(action)[brain_name]
        next_state, reward, done = (env_info.vector_observations[0],
                                    env_info.rewards[0],
                                    env_info.local_done[0])
        agent.step(state, action, reward, next_state, done)
                  static_all=True,
                  static_obj_rnd_pos=False,
                  rnd_obj_rnd_pos=False,
                  full_color=False)
env.seed(args.seed)
torch.manual_seed(args.seed)

saved_action = namedtuple('saved_action', ['log_prob', 'value'])

policy_net = DQNCnn(7)
target_net = DQNCnn(7)
memory = ReplayBuffer(100000, 4)
agent = DQNAgent(memory, 64, 1., 1e-6, 0.05, 7, policy_net, target_net,
                 0.99, 0.001)

e = 0
N_EXPLORE = 10
for i in tqdm(range(N_EXPLORE)):
    done = False
    s = env.reset()
    # To make this compatible with the ReplayBuffer, we need to expand the 3rd channel...
    s = np.expand_dims(s, 2)
    while not done:
        last_stored_frame_idx = agent.memory.store_frame(s)
        obs = agent.memory.encode_recent_observation()
        a = np.random.choice([x for x in range(7)])
# Import internal modules
from agents.dqn_agent import DQNAgent

# Import external modules
import gym

if __name__ == "__main__":
    # initialize gym environment and the agent
    env = gym.make('CartPole-v0')
    agent = DQNAgent(environment=env)

    # Test the agent
    agent.test()
model = build_q_network(
    input_shape=[len(ini_observation)],
    nb_output=len(actions))
target_model = build_q_network(
    input_shape=[len(ini_observation)],
    nb_output=len(actions))

agent = DQNAgent(actions=actions,
                 memory=memory,
                 update_interval=200,
                 train_interval=1,
                 batch_size=32,
                 observation=ini_observation,
                 model=model,
                 target_model=target_model,
                 policy=policy,
                 loss_fn=loss_fn,
                 optimizer=optimizer,
                 obs_processor=obs_processor,
                 is_ddqn=True)

step_history = []
reward_history = []
nb_episodes = 1000
# for episode in range(nb_episodes):
episode_reward_average = -1
with tqdm.trange(nb_episodes) as t:
    for episode in t:
        # agent.reset()
import argparse
import copy

import microgridRLsimulator
import params.params as params
from agents.random_agent import RandomAgent
from agents.agent_ppo import PPOAgent
from agents.dqn_agent import DQNAgent

parser = argparse.ArgumentParser()
parser.add_argument('--env', '-e', type=str, default='microgrid',
                    choices=['microgrid', 'maze-dense', 'maze-sparse'])
args = parser.parse_args()

params = copy.deepcopy(params.params)
params['env']['env'] = args.env
params['env']['case'] = 'elespino_discrete'

agent = DQNAgent(params)
agent.train()
# agent.test()
agent.store_results(render_tr_te=2)
print("End of agent's life")
import gym
from agents.dqn_agent import experienceReplayBuffer_DQN, DQNAgent, QNetwork_DQN
import torch
from agents import evaluate
from copy import deepcopy

if __name__ == "__main__":
    n_iter = 100000
    env = gym.make('gym_pvz:pvz-env-v2')
    nn_name = input("Save name: ")

    buffer = experienceReplayBuffer_DQN(memory_size=100000, burn_in=10000)
    net = QNetwork_DQN(env, device='cpu', use_zombienet=False, use_gridnet=False)
    # old_agent = torch.load("agents/benchmark/dfq5_znet_epslinear")
    # net.zombienet.load_state_dict(old_agent.zombienet.state_dict())
    # for p in net.zombienet.parameters():
    #     p.requires_grad = False
    # net.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
    #                                  lr=net.learning_rate)

    agent = DQNAgent(env, net, buffer, n_iter=n_iter, batch_size=200)
    agent.train(max_episodes=n_iter, evaluate_frequency=5000, evaluate_n_iter=1000)
    torch.save(agent.network, nn_name)
    agent._save_training_data(nn_name)
BUFFER_SIZE = 10**4
EPISODES = 1001
EPISODE_LENGTH = 200
DISCOUNT = 0.99
LR = 1e-3
PRESAMPLE = 10
BATCH_SIZE = 128
C = 10
INTERVAL = 10

## DQN
env.reset()
buffer = ReplayBuffer(BUFFER_SIZE)
dqn_q_network = build_network(n_states, n_actions, 2, 200)
agent = DQNAgent(dqn_q_network, DISCOUNT, LR)
learner = DQNLearner(env, buffer, agent)
dqn_hist = learner.train(presample=PRESAMPLE,
                         batch_size=BATCH_SIZE,
                         episodes=EPISODES,
                         episode_length=EPISODE_LENGTH,
                         interval=INTERVAL,
                         C=C,
                         save_path="./",
                         save_name="DQN")

## Double DQN
env.reset()
buffer = ReplayBuffer(BUFFER_SIZE)
double_q_network = build_network(n_states, n_actions, 2, 200)
state = next_state

print()
print(f"Results after {episodes} episodes:")
print(f"Average reward per episode: {total_reward / episodes}")
print(f"Average time steps per episode: {total_epochs / episodes}")

# Load a LunarLander environment
env_name = "LunarLander-v2"
env = gym.make(env_name)

random_agent = RandomDQNAgent(env_name, env, 1000, is_state_box=True,
                              memory_capacity=100000)
# random_agent.train()

agent = DQNAgent(env_name, env, 5000, learning_rate=0.00025,
                 start_epsilon=1.0, discount_factor=0.99, decay_rate=0.0001,
                 make_checkpoint=True, is_state_box=True, batch_size=64,
                 memory_capacity=100000)
# agent.memory = random_agent.memory
# agent.train()

weights, rewards, episode_len = agent.load(
    "/home/dsalwala/NUIG/Thesis/rl-algos/data/LunarLander-v2_100.npy")
stats = plotting.EpisodeStats(episode_lengths=episode_len,
                              episode_rewards=rewards)

# Search for Q values
# nn, stats = agent.nn.get_weights(), agent.stats
nn = ANN(8, 4, 0.00025)
nn.set_weights(weights)
play_episode(env, nn, 1)
import acme
import gym
from acme import specs, wrappers
from acme.utils import loggers
from acme.wrappers import gym_wrapper

from agents.dqn_agent import DQNAgent
from networks.models import Models

from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())


def render(env):
    return env.environment.render(mode='rgb_array')


environment = gym_wrapper.GymWrapper(gym.make('LunarLander-v2'))
environment = wrappers.SinglePrecisionWrapper(environment)
environment_spec = specs.make_environment_spec(environment)

model = Models.sequential_model(
    input_shape=environment_spec.observations.shape,
    num_outputs=environment_spec.actions.num_values,
    hidden_layers=3,
    layer_size=300)

agent = DQNAgent(environment_spec=environment_spec, network=model)
logger = loggers.TerminalLogger(time_delta=10.)
loop = acme.EnvironmentLoop(environment=environment, actor=agent)
loop.run()
def train():
    """ Trains a DQN agent in the Unity Banana environment. """

    # set hyperparameters
    #
    # udacity dqn baseline: solved after 487 steps
    # buffer_size = int(1e5)
    # batch_size = 64
    # gamma = 0.99
    # tau = 1e-3
    # learning_rate = 5e-4
    # eps_start = 1.0
    # eps_end = 0.01
    # eps_decay = 0.995
    # fc1_units = 64
    # fc2_units = 64
    # q_function_update_fraction = 4
    # seed = 0

    # # larger network in 1st layer
    # buffer_size = int(1e5)
    # batch_size = 64
    # gamma = 0.99
    # tau = 1e-3
    # learning_rate = 5e-4
    # eps_start = 1.0
    # eps_end = 0.01
    # eps_decay = 0.995
    # fc1_units = 128
    # fc2_units = 64
    # q_function_update_fraction = 4
    # seed = 0

    # # smaller network in 1st and 2nd layer
    # buffer_size = int(1e5)
    # batch_size = 64
    # gamma = 0.99
    # tau = 1e-3
    # learning_rate = 5e-4
    # eps_start = 1.0
    # eps_end = 0.01
    # eps_decay = 0.995
    # fc1_units = 32
    # fc2_units = 16
    # q_function_update_fraction = 4
    # seed = 0

    # # higher discount rate
    # buffer_size = int(1e5)
    # batch_size = 64
    # gamma = 0.9999
    # tau = 1e-3
    # learning_rate = 5e-4
    # eps_start = 1.0
    # eps_end = 0.01
    # eps_decay = 0.995
    # fc1_units = 64
    # fc2_units = 64
    # q_function_update_fraction = 4
    # seed = 0

    # # higher eps. decay rate
    # buffer_size = int(1e5)
    # batch_size = 64
    # gamma = 0.99
    # tau = 1e-3
    # learning_rate = 5e-4
    # eps_start = 1.0
    # eps_end = 0.01
    # eps_decay = 0.999
    # fc1_units = 64
    # fc2_units = 64
    # q_function_update_fraction = 4
    # seed = 0

    # smaller eps. decay rate (active configuration)
    buffer_size = int(1e5)
    batch_size = 64
    gamma = 0.99
    tau = 1e-3
    learning_rate = 5e-4
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.990
    fc1_units = 64
    fc2_units = 64
    q_function_update_fraction = 4
    seed = 0

    # use a simple concatenation of all hyperparameters as the experiment name.
    # results are stored in a subfolder with this name
    experiment_name = "6-smaller_eps_decay-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}".format(
        buffer_size, batch_size, gamma, tau, learning_rate, eps_start,
        eps_end, eps_decay, fc1_units, fc2_units, q_function_update_fraction,
        seed)

    # in addition to creating the experiment folder, create subfolders for
    # checkpoints and logs
    if not os.path.isdir(experiment_name):
        os.mkdir(experiment_name)
        os.mkdir(experiment_name + '/checkpoints')
        os.mkdir(experiment_name + '/logs')

    # log the hyperparameters
    with open(experiment_name + '/logs/' + 'hyperparameters.log', 'w') as f:
        print(
            "Buffer size {}\nbatch size {}\ngamma {}\ntau {}\nlearning_rate {}\n"
            "fc1-fc2 {}-{}\nq-function_update_fraction {}\nseed {}".format(
                buffer_size, batch_size, gamma, tau, learning_rate, fc1_units,
                fc2_units, q_function_update_fraction, seed),
            file=f)

    ############ THE ENVIRONMENT ###############
    env = UnityEnvironment(file_name='Banana_Linux/Banana.x86_64', seed=seed)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # get the number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # get the size of the action space
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    dqn_agent = DQNAgent(name=experiment_name,
                         state_size=state_size,
                         action_size=action_size,
                         learning_rate=learning_rate,
                         discount_rate=gamma,
                         eps_start=eps_start,
                         eps_end=eps_end,
                         eps_decay=eps_decay,
                         tau=tau,
                         network_architecture=[fc1_units, fc2_units],
                         experience_replay_buffer_size=buffer_size,
                         experience_replay_buffer_batch_size=batch_size,
                         experience_replay_start_size=3200,
                         q_function_update_fraction=q_function_update_fraction,
                         device='gpu',
                         seed=seed)

    # run the train loop
    scores_all = train_loop(env=env,
                            brain_name=brain_name,
                            agent=dqn_agent,
                            experiment_name=experiment_name)
    pickle.dump(scores_all, open(experiment_name + '/scores_all.pkg', 'wb'))

    # plot the results
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores_all) + 1), scores_all)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

    # finally, close the environment
    env.close()
            next_state, reward, done = (process_state(env_info.visual_observations[0]),
                                        env_info.rewards[0],
                                        env_info.local_done[0])
            state_window.append(next_state)
            state = np.vstack(
                [np.expand_dims(np.array(s), 0) for s in state_window])
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        print('\rTest Episode {}\tLast Score: {:.2f}\tAverage Score: {:.2f}'.format(
            i_episode, score, np.mean(scores_window)), end="")
    print('\rTest after {} episode mean {:.2f}'.format(n_ep_train,
                                                       np.mean(scores_window)))
    return np.mean(scores_window)


if __name__ == '__main__':
    env = UnityEnvironment(file_name="./Banana_Linux/Banana.x86_64")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    state_size = len(env_info.vector_observations[0])

    config = generate_configuration_qnet_visual(action_size, state_size)
    agent = DQNAgent(config)
    agent.load_weights("./checkpoint.pth")
    print(test(env, agent, 0, n_episodes=100, sleep_t=0))
""" Use this script to train the Double DQN agent that takes the segmented frames as input. This script will take about a day to run, depending on your computer performances. Logs and networks will be saved in /exp/rl At any point, you can visualize the agent playing by turning the `dqn-segm` (or the name of the model) field to True in the file /display.yml """ from agents.dqn_agent import DQNAgent from agents.training import train from environment.env import make_environment_for_dqn_with_segm from utils.reproductibility import seed_all if __name__ == "__main__": e = make_environment_for_dqn_with_segm() seed_all(e) a = DQNAgent(e, "dqn-segm") train(a, e)
class LearningEngine:
    def __init__(self, learner_type):
        self.agent = DQNAgent(100, 2, 0.3, 0.4)
        self.env = None

        # set the model for the learning engine depending on
        # learning type
        if learner_type == 'the_rival':
            self.agent.init_model('qnet')
        else:
            # placeholder: replace with a more suitable model for mime if one exists
            self.agent.init_model('qnet')

    def init_game(self, width, height, obstacles):
        self.env = GameEnv(width, height, obstacles)
        self.env.reset()

    def train_mime(self, opponent, move_history, nb_episodes, delimiter=':'):
        action_ls = ['38', '40', '37', '39']
        for episode_idx in range(nb_episodes):
            # logger.info(episode_idx, extra={'tags': ['dev_mssg: episode_idx']})
            self.env.reset()

            # get initial game state by enacting first user move
            user_action = move_history[0].split(delimiter)[-1]
            state, reward, done, _ = self.env.step(
                action_ls.index(user_action), 1)
            max_plies = 30

            # play the game
            for step_idx in count(1):
                # break out of game if too many turns and no one has won
                if step_idx > max_plies:
                    break

                expected_action = move_history[step_idx].split(delimiter)[-1]
                expected_action = action_ls.index(expected_action)
                # logger.info(step_idx, extra={'tags': ['dev_mssg: step_idx']})

                if step_idx % 2 != 0:
                    next_state, _, done, _ = self.env.step(
                        expected_action, (step_idx % 2) + 1)
                else:
                    # select an action and then perform it
                    action = self.agent.select_action(state)
                    next_state, reward, done, _ = self.env.step(
                        action[0, 0], (step_idx % 2) + 1, expected_action)

                    # Perform one step of the optimization (on the target network)
                    self.agent.optimize()

                    # Store the transition in memory
                    self.agent.memory.remember(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                if done:
                    break

    def train_agent(self, opponent, nb_episodes, plot_performance=False):
        avg_reward_ls = []
        for episode_idx in range(nb_episodes):
            # logger.info(episode_idx, extra={'tags': ['dev_mssg: episode_idx']})
            self.env.reset()
            state, reward, done, _ = self.env.step(0, 1)
            total_reward, num_plies, max_plies = 0.0, 0, 30

            # play the game
            for step_idx in count(1):
                num_plies = step_idx
                # break out of game if too many turns and no one has won
                if step_idx > max_plies:
                    break
                # logger.info(step_idx, extra={'tags': ['dev_mssg: step_idx']})

                # set current player based on turn
                current_agent = self.agent if step_idx % 2 == 0 else opponent

                # select an action and then perform it
                action = current_agent.select_action(state)
                next_state, reward, done, _ = self.env.step(
                    action[0, 0], (step_idx % 2) + 1)
                total_reward += reward

                # Store the transition in memory
                current_agent.memory.remember(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                # Perform one step of the optimization (on the target network)
                current_agent.optimize()

                if done:
                    break

            avg_reward_ls.append(total_reward / num_plies)

        # TODO: plot training performance
        if plot_performance:
            img = io.BytesIO()
            _, ax = plt.subplots()
            sns.tsplot(time=list(range(nb_episodes)), data=avg_reward_ls,
                       condition='Training Loss', legend=True, ax=ax)
            ax.set_xlabel('Games')
            ax.set_ylabel('Average Reward')
            plt.savefig(img, format='png')
            img.seek(0)
            return base64.b64encode(img.getvalue()).decode()
        else:
            return None
def test():
    # set hyperparameters (not really important for running the agent)
    # higher eps. decay rate
    buffer_size = int(1e5)
    batch_size = 64
    gamma = 0.99
    tau = 1e-3
    learning_rate = 5e-4
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.999
    fc1_units = 64
    fc2_units = 64
    q_function_update_fraction = 4
    seed = 0

    ############ THE ENVIRONMENT ###############
    env = UnityEnvironment(file_name='Banana_Linux/Banana.x86_64', seed=seed)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # get the number of agents
    num_agents = len(env_info.agents)

    # get the size of the action space
    action_size = brain.vector_action_space_size

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    # initialize agent
    dqn_agent = DQNAgent(name=None,
                         state_size=state_size,
                         action_size=action_size,
                         learning_rate=learning_rate,
                         discount_rate=gamma,
                         eps_start=eps_start,
                         eps_end=eps_end,
                         eps_decay=eps_decay,
                         tau=tau,
                         network_architecture=[fc1_units, fc2_units],
                         experience_replay_buffer_size=buffer_size,
                         experience_replay_buffer_batch_size=batch_size,
                         experience_replay_start_size=3200,
                         q_function_update_fraction=q_function_update_fraction,
                         device='gpu',
                         seed=seed)
    dqn_agent.load_state_dict(torch.load('checkpoint.pth'))

    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    for i in range(200):
        actions = dqn_agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards
        states = next_states
        if np.any(dones):
            break
# Import internal modules
from agents.dqn_agent import DQNAgent

# Import external modules
import gym

if __name__ == "__main__":
    # initialize gym environment and the agent
    env = gym.make('CartPole-v0')
    agent = DQNAgent(environment=env)

    # Start the game
    agent.run()
import gym
import numpy as np

from agents.dqn_agent import DQNAgent

env = gym.make("LunarLander-v2")
env.seed(0)
agent = DQNAgent(env.action_space.n, env.observation_space.shape[0])

episodes = 400
steps = 3000
loss = []

for i_episode in range(episodes):
    obv = np.reshape(env.reset(), (1, 8))
    total_reward = 0
    done = False
    for t in range(steps):
        # env.render()
        # print(observation)
        action = agent.act(obv, total_reward, done)
        next_obv, reward, done, info = env.step(action)
        next_obv = np.reshape(next_obv, (1, 8))
        total_reward += reward
        agent.store_transition(obv, action, reward, next_obv, done)
        obv = next_obv
        agent.replay()
        if done:
            print("{}/{}, reward: {} in {} timesteps".format(
                i_episode, episodes, total_reward, t + 1))
            break
    loss.append(total_reward)
    # Average score of the last 100 episodes
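    # A minimal continuation sketch (an assumption, not from the source) for the
    # comment above: report the mean return over the last 100 episodes stored in
    # `loss`, and stop early once LunarLander-v2's usual solve threshold of an
    # average score of 200 is reached.
    avg_100 = np.mean(loss[-100:])
    if avg_100 >= 200:
        print("Solved after {} episodes, average score {:.2f}".format(
            i_episode + 1, avg_100))
        break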