def evaluate(agent: DQNAgent, n_epoch=10, render=False):
    """
    Evaluate the agent.
    :param agent: agent to be evaluated
    :param n_epoch: number of episodes to evaluate; more episodes give a more accurate estimate
    :param render: if True, visualize the evaluation
    :return: mean score over the evaluation episodes
    """
    env = gym.make('LunarLander-v2')
    score = []
    for e in range(n_epoch):
        done = False
        state = env.reset()
        epoch_reward = 0
        step = 1
        while not done and step < 1000:  # cap each episode at 1000 steps
            step += 1
            if render:
                env.render()
            action_dist = agent.get_q(preprocess_state(state))
            action = agent.select_action(action_dist)
            next_state, reward, done, info = env.step(action)
            epoch_reward += reward
            state = next_state
        print("episode {}/{}, reward: {}".format(e, n_epoch, epoch_reward))
        score.append(epoch_reward)
    return np.mean(score)
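# The evaluator above calls a `preprocess_state` helper that this snippet does
# not define. A minimal sketch of what it plausibly does (an assumption, not
# the project's confirmed implementation): add a batch dimension so the
# Q-network receives shape (1, state_dim).
import numpy as np

def preprocess_state(state):
    # LunarLander observations are flat float vectors; the network expects a batch.
    return np.reshape(np.asarray(state, dtype=np.float32), (1, -1))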
def learn_on_mini_batch(e, actor: Actor, critic: DQNAgent, critic_target: DQNAgent,
                        exp_replay: ExperienceReplay, config=dense_config):
    batch_size = FLAGS.batch_size
    mini_batch = exp_replay.getMiniBatch(batch_size)
    state_batch, action_batch, reward_batch, dones_batch, next_state_batch = [], [], [], [], []
    for exp in mini_batch:
        state_batch.append(exp.state)
        action_batch.append(exp.action)
        reward_batch.append(exp.reward)
        dones_batch.append(exp.done)
        if dones_batch[-1]:
            # Placeholder for terminal transitions: the (1 - done) mask below
            # zeroes out the bootstrap term, so any valid-shaped state works here.
            next_state_batch.append(exp.state)
        else:
            next_state_batch.append(exp.next_state)
    Actor_Y_Batch = np.zeros((len(mini_batch), actor.output_size[-1]))
    Critic_Y_Batch = np.zeros((len(mini_batch), 1))
    critic_batch_output_for_state = critic_target.get_q(state_batch)
    critic_batch_output_for_next_state = critic_target.get_q(next_state_batch)
    for i, reward in enumerate(reward_batch):  # iterate over the batch
        # Scalar TD target: r + gamma * max_a' Q(s', a'), masked out for terminal states.
        target = reward + (critic.gamma * np.max(critic_batch_output_for_next_state[i])) * (1 - dones_batch[i])
        Critic_Y_Batch[i] = target
        # Q(s, a) - V(s) = advantage, used for stability.
        Actor_Y_Batch[i][action_batch[i]] = target - critic_batch_output_for_state[i]
    critic.learn(target_batch=Critic_Y_Batch,
                 learning_rate=config.learning_rate_schedule_critic(e),
                 input=state_batch)
    actor.learn(target_batch=Actor_Y_Batch,
                learning_rate=config.learning_rate_schedule_actor(e),
                input=state_batch)
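# `learn_on_mini_batch` assumes an ExperienceReplay whose `getMiniBatch` returns
# records with .state/.action/.reward/.done/.next_state fields. A minimal sketch
# under those assumptions (hypothetical; the project's actual buffer may differ):
import random
from collections import deque, namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'done', 'next_state'])

class SimpleExperienceReplay:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def add(self, *args):
        self.buffer.append(Experience(*args))

    def getMiniBatch(self, batch_size):
        # Uniform sampling without replacement.
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))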
def evaluate(agent: DQNAgent, n_epoch=10, render=False, verbose=False, record=False, video_path=None):
    """
    Evaluate the agent.
    :param agent: agent to be evaluated
    :param n_epoch: number of episodes to evaluate; more episodes give a more accurate estimate
    :param render: if True, visualize the evaluation
    :param verbose: if True, print the reward of every episode
    :param record: if True, record a video of every episode
    :param video_path: directory for recorded videos (defaults to "./vid")
    :return: mean score over the evaluation episodes
    """
    env = gym.make('MsPacmanDeterministic-v4')
    if record:
        video_save_location = "./vid" if not video_path else video_path
        env = gym.wrappers.Monitor(env, video_save_location,
                                   video_callable=lambda episode_id: True, force=True)
    final_score = []
    for e in range(n_epoch):
        state = init_state()
        observation = env.reset()
        observation = process_observation(observation)
        done = False
        epoch_reward = 0.0
        while not done:
            state = append_frame(state, observation)
            if render:
                env.render()
            q_values = agent.get_q(state=np.expand_dims(state, axis=0))
            action = agent.select_action(qValues=q_values, explore=False)
            next_observation, reward, done, _ = env.step(action)
            observation = process_observation(next_observation)
            epoch_reward += reward
        if verbose:
            print("Episode {} / {} finished with reward {}".format(e, n_epoch, epoch_reward))
        final_score.append(epoch_reward)
    env.close()
    return np.mean(final_score)
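# `init_state`, `process_observation`, and `append_frame` are not defined in
# this snippet. A minimal frame-stacking sketch under the common assumptions of
# 84x84 grayscale frames and a stack depth of 4 (both values hypothetical here):
import numpy as np
import cv2  # only needed for this sketch's resizing

FRAME_SHAPE = (84, 84)
STACK_DEPTH = 4

def init_state():
    # Start each episode with an all-zero stack of frames.
    return np.zeros((STACK_DEPTH,) + FRAME_SHAPE, dtype=np.float32)

def process_observation(observation):
    # Convert RGB to grayscale, resize, and scale to [0, 1].
    gray = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, FRAME_SHAPE[::-1]).astype(np.float32) / 255.0

def append_frame(state, observation):
    # Drop the oldest frame and append the newest one.
    return np.concatenate([state[1:], observation[np.newaxis]], axis=0)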
def evaluate(agent: DQNAgent, n_epoch=10, render=False):
    """
    Evaluate the agent.
    :param agent: agent to be evaluated
    :param n_epoch: number of episodes to evaluate; more episodes give a more accurate estimate
    :param render: if True, visualize the evaluation
    :return: mean score over the evaluation episodes
    """
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env, frame_stack=True)
    final_score = []
    for e in range(n_epoch):
        state = np.asarray(env.reset())
        done = False
        epoch_reward = 0.0
        while not done:
            if render:
                env.render()
            q_values = agent.get_q(state=np.reshape(state, (1,) + state.shape))
            action = agent.select_action(qValues=q_values, explore=False)
            # The agent outputs actions in [0, 2]; the env expects 1 (up),
            # 2 (stay), 3 (down), so offset by 1.
            next_state, reward, done, _ = env.step(action + 1)
            state = np.asarray(next_state)
            epoch_reward += reward
        print("Episode {} / {} finished with reward {}".format(e, n_epoch, epoch_reward))
        final_score.append(epoch_reward)
    env.close()
    return np.mean(final_score)
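# The `env.step(action + 1)` offset above relies on Pong's action table. A quick
# way to inspect that assumption with the standard gym API (`wrap_deepmind` is
# assumed to come from the OpenAI baselines Atari wrappers):
import gym

check_env = gym.make("PongNoFrameskip-v4")
print(check_env.unwrapped.get_action_meanings())
# Prints the meaning of each action index; the agent's three outputs are mapped
# onto indices 1-3 of this table by the offset above.
check_env.close()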
def policy_distilliation_batch_train(exp_replay, student: DQNAgent, learning_rate=1.0e-4,
                                     config=student_config, use_per=False, e=None):
    """
    Train the student on a batch of experiences.
    :param student: student to be trained on a batch of experiences
    :param exp_replay: the experience replay
    :param learning_rate: learning rate for the SGD
    :param config: config describing the batch size and the output size
    :param use_per: if True, use prioritized supervised experience replay
    :return: loss function value
    """
    if not use_per:
        mini_batch = exp_replay.getMiniBatch(batch_size=config.batch_size)
        # Uniform sampling: all importance weights are 1.
        weights, indexes = np.ones((len(mini_batch), config.output_size)), None
    else:
        mini_batch, weights, indexes = exp_replay.getMiniBatch(
            batch_size=config.batch_size,
            beta=config.beta_schedule(beta0=config.BETA0_PER, e=e, n_epoch=config.n_epochs))
    state = [exp.state for exp in mini_batch]
    target = np.squeeze([exp.label for exp in mini_batch])
    loss, td_errors = student.learn(target_batch=target, input=state,
                                    learning_rate=learning_rate, weights=weights)
    if use_per:
        # Normalize TD errors by the batch maximum.
        td_errors = td_errors / np.max(td_errors)
        action_batch = [exp.action for exp in mini_batch]
        # Add epsilon so that every transition keeps a nonzero sampling probability.
        new_priority = np.abs(td_errors) + config.EPS_PER
        new_priority = [priority[action_batch[i]] for i, priority in enumerate(new_priority)]
        exp_replay.update_priorities(indexes=indexes, priorities=new_priority)
    return loss
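# The distillation buffer above stores experiences carrying a `label` (the
# teacher's Q-values) alongside the state and an action. A minimal sketch of
# that record and how it might be filled; the record and helper names are
# assumptions, not the project's confirmed API:
from collections import namedtuple
import numpy as np

DistillExperience = namedtuple('DistillExperience', ['state', 'label', 'action'])

def make_distill_experience(teacher, state):
    # Use the teacher's Q-values as soft supervised targets for the student.
    q_values = teacher.get_q(np.expand_dims(state, axis=0))
    return DistillExperience(state=state, label=q_values, action=int(np.argmax(q_values)))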
plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(), interpolation='none')

# Hyper-parameters
BATCH_SIZE = 32
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape
n_actions = env.action_space.n

policyNet = DQNAgent(screen_height, screen_width, n_actions).to(device)
targetNet = DQNAgent(screen_height, screen_width, n_actions).to(device)
targetNet.load_state_dict(policyNet.state_dict())  # initialize targetNet with policyNet's weights
targetNet.eval()

optimizer = optim.RMSprop(policyNet.parameters())
memory = ReplayMemory(10000)
steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
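    # The snippet ends mid-function; a completion following the standard
    # epsilon-greedy pattern from the PyTorch DQN tutorial, using the EPS_*
    # constants above (a sketch; assumes `math`, `random`, and `torch` are
    # imported earlier in the script):
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # Exploit: pick the action with the largest predicted Q-value.
            return policyNet(state).max(1)[1].view(1, 1)
    # Explore: fall back to a uniformly random action.
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)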
from model import DQNAgent
import gym
import numpy as np

# Number of games for the agent to train on
episodes = 1000

# Initialize the gym environment and the agent
env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
agent.build_model()

# Iterate the game
for e in range(episodes):
    # Reset the state at the beginning of each game
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    # time_t represents each frame of the game.
    # The goal is to keep the pole upright as long as possible, up to a score of 500;
    # the more frames survived, the higher the score.
    for time_t in range(500):
        # Comment this out if you don't want to render
        env.render()
        # Decide on an action
        action = agent.act(state)
        # Advance the game to the next frame based on the action.
        # The reward is 1 for every frame the pole survives.
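        # The loop is cut off before the environment step. A hedged completion
        # in the style of the classic Keras CartPole DQN tutorial; `remember`,
        # `replay`, and the batch size of 32 are assumed names and values, not
        # confirmed by this snippet:
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # Store the transition for experience replay.
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}".format(e, episodes, time_t))
            break
    # Train on a random minibatch of past transitions.
    agent.replay(32)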
start = time.time()
writer = SummaryWriter()

# Hyper-parameters
BATCH_SIZE = 512
MEMORY_SIZE = 5000
LR = 0.001
test_interval = 1000
test_episodes = 100
TIMESTEPS = 10000
EPSILON_ENDT = 3000

env = gym.make('CartPole-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = DQNAgent(d_actions=env.action_space.n, device=device, batch_size=BATCH_SIZE,
                 memory_size=MEMORY_SIZE, lr=LR, epsilon_endt=EPSILON_ENDT)
agent.policy_net = MLPPolicy(d_state=env.observation_space.shape[0], d_hidden=20,
                             d_action=env.action_space.n).to(device)

init = time.time()
print("Init time {}".format(init - start))

num_episode = 0
episode_t = 0
state = env.reset()
state = torch.from_numpy(state).unsqueeze_(0).to(device=device, dtype=torch.float)
while agent.time_step < TIMESTEPS:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action.item())
    episode_t += 1
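    # The training loop is truncated here. A hedged sketch of how it would
    # typically continue, assuming the agent exposes `remember` and `learn`
    # methods and advances `time_step` internally (these names are assumptions):
    next_state_t = torch.from_numpy(next_state).unsqueeze_(0).to(device=device, dtype=torch.float)
    agent.remember(state, action, reward, next_state_t, done)  # store the transition
    agent.learn()  # one gradient step on a sampled minibatch
    state = next_state_t
    if done:
        writer.add_scalar('episode_length', episode_t, num_episode)
        num_episode += 1
        episode_t = 0
        state = env.reset()
        state = torch.from_numpy(state).unsqueeze_(0).to(device=device, dtype=torch.float)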
from model import DQNAgent
import math
import time
import gym
import numpy as np
from tensorboardX import SummaryWriter

EPSILON_START = 1.0
EPSILON_FINAL = 0.1
EPSILON_DECAY = 250000
EPISODES = 5000

# Exponentially anneal epsilon from EPSILON_START down to EPSILON_FINAL.
epsilon_by_frame = lambda step_idx: EPSILON_FINAL + (
    EPSILON_START - EPSILON_FINAL) * math.exp(-1. * step_idx / EPSILON_DECAY)

writer = SummaryWriter(comment='DQN')
num_frames = 0
env = gym.make('Riverraid-v0')
agent = DQNAgent(env)
is_render = False

for i_episode in range(EPISODES):
    score = 0
    observation = env.reset()
    observation = WarpFrame(observation)
    # Stack 4 copies of the first frame to build the initial state.
    observation = np.stack([observation] * 4, axis=0)
    done = False
    # is_render = i_episode % 10 == 0
    t = time.time()
    loss = []
    while not done:
        if is_render:
            env.render()
        # print(observation.shape)
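        # The episode loop is cut off here. A hedged sketch of the usual DQN
        # inner step, reusing `epsilon_by_frame` from above; `agent.act`,
        # `agent.remember`, and `agent.learn` are assumed method names:
        epsilon = epsilon_by_frame(num_frames)
        action = agent.act(observation, epsilon)
        next_frame, reward, done, _ = env.step(action)
        next_frame = WarpFrame(next_frame)
        # Shift the frame stack: drop the oldest frame, append the newest.
        next_observation = np.concatenate([observation[1:], next_frame[np.newaxis]], axis=0)
        agent.remember(observation, action, reward, next_observation, done)
        loss.append(agent.learn())
        observation = next_observation
        score += reward
        num_frames += 1
    writer.add_scalar('score', score, i_episode)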
# In[12]:

output_dir = 'model/cartpole'

# In[13]:

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# In[15]:

from model import DQNAgent

# In[16]:

agent = DQNAgent(state_size, action_size)

# In[17]:

agent.model.summary()

# In[20]:

done = False
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(5000):
        # env.render()
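        # The notebook cell is truncated here. A hedged continuation in the
        # usual Keras CartPole pattern; `agent.remember`/`agent.replay`, the
        # batch size of 32, and the terminal-step penalty are assumptions, as
        # are the earlier (unshown) cells defining `env`, `state_size`,
        # `n_episodes`, `np`, and `os`:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10  # penalize the terminal step (common trick)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}".format(e, n_episodes, time))
            break
    if len(agent.memory) > 32:
        agent.replay(32)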
if options.nogui:
    sumoBinary = checkBinary('sumo')
else:
    sumoBinary = checkBinary('sumo-gui')
sumoInt.routeFileGenerator()

# Main logic
# Parameters
episodes = 100
batch_size = 100
green_duration = 10
yellow_duration = 6

agentGenerator = DQNAgent()
try:
    agentGenerator.load('Models/reinf_traf_control.h5')
except Exception:
    print('No models found')

for e in range(episodes):
    # DQN agent:
    #   - initialize the network with random weights
    #   - initialize the target network with the same weights
    # log = open('log.txt', 'a')
    step = 0
    haltTime = 0
    reward1 = 0
    reward2 = 0
    netReward = 0.9 * (reward1 - reward2)
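    # The episode body is truncated. A hedged sketch of how such a SUMO control
    # loop typically proceeds with the TraCI API (assumes `import traci` earlier
    # in the script); the config file name, edge IDs, and step cap are
    # hypothetical, only the structure mirrors the variables initialized above:
    traci.start([sumoBinary, "-c", "cross.sumocfg"])  # hypothetical config file
    while traci.simulation.getMinExpectedNumber() > 0 and step < 2000:
        traci.simulationStep()
        # Halted-vehicle counts on two approaches as the two reward components
        # compared in netReward above (edge IDs are hypothetical).
        reward1 = traci.edge.getLastStepHaltingNumber('edge_in_1')
        reward2 = traci.edge.getLastStepHaltingNumber('edge_in_2')
        netReward = 0.9 * (reward1 - reward2)
        step += 1
    traci.close()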