Example #1
def main():
    # Create CartPole environment and network
    env = gym.make('CartPole-v0').unwrapped
    if not os.path.exists(model_path):
        raise Exception("You should train the DQN first!")
    net = DQN(n_state=env.observation_space.shape[0],
              n_action=env.action_space.n,
              epsilon=epsilon,
              batch_size=batch_size,
              model_path=model_path)
    net.load()
    net.cuda()
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        while True:
            # env.render()

            # Select action and obtain the reward
            a = net.chooseAction(s)
            s_, r, finish, _ = env.step(a)

            total_reward += r
            if finish:
                print("Episode: %d \t Total reward: %d \t Eps: %f" %
                      (i, total_reward, net.epsilon))
                reward_list.append(total_reward)
                break
            s = s_
    env.close()
    print("Testing average reward: ", np.mean(reward_list))
Example #2
def load_and_test(opt):
    netp1 = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    netp1.load(opt.load_path)
    load_path2 = list(opt.load_path)
    print(opt.load_path)
    load_path2[-11] = '2'
    load_path2 = "".join(load_path2)
    print(load_path2)
    netp2 = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    netp2.load(load_path2)
    if opt.player == 1:
        r1, r2, w, d = test_ep_pvp(netp1,
                                   netp2,
                                   opt.num_test,
                                   opt.eps,
                                   render=opt.render)

        print('p1 average reward:', r1)
        print('p2 average reward:', r2)
        print('p1 win rate:', w)
        print('p2 win rate:', 1 - w - d)
        print('draw rate:', d)

    elif opt.player == 2:
        r2, r1, w, d = test_ep_pvp(netp2,
                                   netp1,
                                   opt.num_test,
                                   opt.eps,
                                   render=opt.render)

        print('p1 average reward:', r1)
        print('p2 average reward:', r2)
        print('p1 win rate:', 1 - w - d)
        print('p2 win rate:', w)
        print('draw rate:', d)
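
The in-place character substitution (load_path2[-11] = '2') assumes a checkpoint filename in which the player digit sits exactly eleven characters from the end of the path; the actual naming scheme is not shown here. A hypothetical illustration of that assumption (a str.replace or an os.path-based rename would be a more robust alternative):

# Hypothetical path chosen so that the player digit lands at index -11;
# the real checkpoint naming scheme is defined elsewhere.
load_path = "checkpoints/dqn_p1_final.pth"
chars = list(load_path)
chars[-11] = '2'                  # swap player 1 for player 2
print("".join(chars))             # checkpoints/dqn_p2_final.pth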
Example #3
def load_and_play(opt):
    net = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    net.load(opt.load_path)
    if opt.player == 1:
        r, w = test_ep(net,
                       opt.opp_policy,
                       opt.num_test,
                       opt.eps,
                       render=opt.render)

    elif opt.player == 2:
        r, w = test_ep_p2(net,
                          opt.opp_policy,
                          opt.num_test,
                          opt.eps,
                          render=opt.render)

    print('average reward: {:.3f}'.format(r))
    print('win rate: {:.1f}%'.format(100 * w))
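
load_and_play expects an options object carrying the fields used above (load_path, player, opp_policy, num_test, eps, render). A minimal sketch of such an object built with argparse.Namespace; the values are placeholders, not the project's defaults:

import argparse

# Hypothetical options; the field names come from the snippet above, the values are placeholders.
opt = argparse.Namespace(
    load_path="checkpoints/dqn_p1_final.pth",  # assumed checkpoint path
    player=1,             # 1 -> test_ep, 2 -> test_ep_p2
    opp_policy="random",  # opponent policy handed to the test routine
    num_test=100,         # number of evaluation episodes
    eps=0.05,             # epsilon used during evaluation
    render=False,
)
# load_and_play(opt)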
Example #4
def main():
    model = DQN(env.observation_space.shape, env.action_space.n)
    model.load(MODEL_FILENAME)

    while True:

        state = env.reset()
        state = rgb2dataset(state)

        # Transition
        transition = []
        transition.append(state)

        model.episode += 1
        accum_reward = 0

        while True:
            if len(transition) == 4:
                action = model.get_action(transition, is_random=False)
            else:
                action = model.get_action(transition, is_random=True)

            state_, reward, done, info = env.step(action)
            state_ = rgb2dataset(state_)

            accum_reward += reward
            state = state_

            # Transition
            transition.append(state)
            if len(transition) > 4:
                transition.pop(0)

            if RENDER:
                env.render()

            if done:
                print("accum_reward : %7d" % (accum_reward))
                break

    env.close()
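
rgb2dataset is not shown in this example. A common preprocessing step for image observations is to grayscale, downsample and normalize each RGB frame; the sketch below follows that assumption and is not the original implementation:

import numpy as np

def rgb2dataset(frame):
    # Assumed preprocessing: grayscale, 2x-downsample and normalize an RGB frame.
    # The original rgb2dataset may crop or scale differently.
    gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114])
    small = gray[::2, ::2]
    return (small / 255.0).astype(np.float32)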
Example #5
class Agent:
	"""
	The intelligent agent of the simulation. It sets up the neural network model and general parameters,
	and is responsible for selecting actions, optimizing the neural network and managing the models.
	"""

	def __init__(self, action_set, train=True, load_path=None):
		#1. Initialize agent params
		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
		self.action_set = action_set
		self.action_number = len(action_set)
		self.steps_done = 0
		self.epsilon = Config.EPS_START
		self.episode_durations = []

		#2. Build networks
		self.policy_net = DQN().to(self.device)
		self.target_net = DQN().to(self.device)
		
		self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

		if not train:		
			self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)	
			self.policy_net.load(load_path, optimizer=self.optimizer)
			self.policy_net.eval()

		self.target_net.load_state_dict(self.policy_net.state_dict())
		self.target_net.eval()

		#3. Create Prioritized Experience Replay Memory
		self.memory = Memory(Config.MEMORY_SIZE)


	 
	def append_sample(self, state, action, next_state, reward):
		"""
		Save the sample (error, <s, a, s', r>) to the replay memory.
		"""

		# Determine whether this is the end of the simulation
		done = next_state is None

		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
		state_action_values = self.policy_net(state)
		state_action_values = state_action_values.gather(1, action.view(-1,1))

		
		if not done:
			# Compute argmax Q(s', a; θ)		
			next_state_actions = self.policy_net(next_state).max(1)[1].detach().unsqueeze(1)

			# Compute Q(s', argmax Q(s', a; θ), θ-)
			next_state_values = self.target_net(next_state).gather(1, next_state_actions).squeeze(1).detach()

			# Compute the expected Q values
			expected_state_action_values = (next_state_values * Config.GAMMA) + reward
		else:
			expected_state_action_values = reward


		error = abs(state_action_values - expected_state_action_values).data.cpu().numpy()


		self.memory.add(error, state, action, next_state, reward)

	def select_action(self, state, train=True):
		"""
		Select the best action according to the Q-values output by the neural network

		Parameters
		----------
			state: float ndarray
				The current state on the simulation
			train: bool
				Defines whether we are evaluating or training the model
		Returns
		-------
			a.max(1)[1]: int
				The action with the highest Q-value
			a.max(0): float
				The Q-value of the action taken
		"""
		global steps_done
		sample = random.random()
		#1. Perform an epsilon-greedy algorithm
		#a. set the value for epsilon
		self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
			math.exp(-1. * self.steps_done / Config.EPS_DECAY)
			
		self.steps_done += 1

		#b. make the decision for selecting a random action or selecting an action from the neural network
		if sample > self.epsilon or (not train):
			# select an action from the neural network
			with torch.no_grad():
				# a <- argmax Q(s, theta)
				a = self.policy_net(state)
				return a.max(1)[1].view(1, 1), a.max(0)
		else:
			# select a random action
			print('random action')
			return torch.tensor([[random.randrange(self.action_number)]], device=self.device, dtype=torch.long), None

	"""
	def select_action(self, state, train=True):
		
		Select the best action according to the Q-values output by the neural network

		Parameters
		----------
			state: float ndarray
				The current state on the simulation
			train: bool
				Defines whether we are evaluating or training the model
		Returns
		-------
			a.max(1)[1]: int
				The action with the highest Q-value
			a.max(0): float
				The Q-value of the action taken
		
		global steps_done
		sample = random.random()
		#1. Perform an epsilon-greedy algorithm
		#a. set the value for epsilon
		self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
			math.exp(-1. * self.steps_done / Config.EPS_DECAY)
			
		self.steps_done += 1

		#b. make the decision for selecting a random action or selecting an action from the neural network
		if sample > self.epsilon or (not train):
			# select an action from the neural network
			with torch.no_grad():
				# a <- argmax Q(s, theta)
				#set the network to train mode is important to enable dropout
				self.policy_net.train()
				output_list = []
				# Feed the state through the network n times to build a statistical model of the outputs
				for i in range(Config.STOCHASTIC_PASSES):
					#print(agent.policy_net(data))
					output_list.append(torch.unsqueeze(F.softmax(self.policy_net(state)), 0))
					#print(output_list[i])

				self.policy_net.eval()
				# The result of the network is the mean of n passes
				output_mean = torch.cat(output_list, 0).mean(0)
				q_value = output_mean.data.cpu().numpy().max()
				action = output_mean.max(1)[1].view(1, 1)

				uncertainty = torch.cat(output_list, 0).var(0).mean().item()
				
				return action, q_value, uncertainty
				
		else:
			# select a random action
			print('random action')
			return torch.tensor([[random.randrange(2)]], device=self.device, dtype=torch.long), None, None

	"""
	def optimize_model(self):
		"""
		Perform one step of optimization on the neural network
		"""

		if self.memory.tree.n_entries < Config.BATCH_SIZE:
			return
		transitions, idxs, is_weights = self.memory.sample(Config.BATCH_SIZE)

		# Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for detailed explanation).
		batch = Transition(*zip(*transitions))

		# Compute a mask of non-final states and concatenate the batch elements
		non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
											  batch.next_state)), device=self.device, dtype=torch.uint8)
		non_final_next_states = torch.cat([s for s in batch.next_state
													if s is not None])
		
		state_batch = torch.cat(batch.state)
		action_batch = torch.cat(batch.action)
		reward_batch = torch.cat(batch.reward)
		
		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
		state_action_values = self.policy_net(state_batch).gather(1, action_batch)
		
	
		# Compute argmax Q(s', a; θ)		
		next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1)

		# Compute Q(s', argmax Q(s', a; θ), θ-)
		next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
		next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, next_state_actions).squeeze(1).detach()

		# Compute the expected Q values
		expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch

		# Compute the TD errors used to update the priorities
		errors = torch.abs(state_action_values.squeeze() - expected_state_action_values).data.cpu().numpy()
		
		# update priority
		for i in range(Config.BATCH_SIZE):
			idx = idxs[i]
			self.memory.update(idx, errors[i])


		# Compute Huber loss
		loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
		loss_return = loss.item()

		# Optimize the model
		self.optimizer.zero_grad()
		loss.backward()
		for param in self.policy_net.parameters():
			param.grad.data.clamp_(-1, 1)
		self.optimizer.step()

		return loss_return

	def save(self, step, logs_path, label):
		"""
		Save the model on hard disc

		Parameters
		----------
			step: int
				current step on the simulation
			logs_path: string
				path to where we will store the model
			label: string
				label that will be used to store the model
		"""

		os.makedirs(os.path.join(logs_path, label), exist_ok=True)

		full_label = label + str(step) + '.pth'
		logs_path = os.path.join(logs_path, label, full_label)

		self.policy_net.save(logs_path, step=step, optimizer=self.optimizer)
	
	def restore(self, logs_path):
		"""
		Load the model from hard disc

		Parameters
		----------
			logs_path: string
				path from which the model will be loaded
		"""
		self.policy_net.load(logs_path)
		self.target_net.load(logs_path)
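
The class above depends on a Transition namedtuple and a prioritized replay Memory (add, sample, update, plus a tree.n_entries counter) that are defined elsewhere. The sketch below only mirrors that interface with uniform sampling so the class can be exercised; it is an assumption, not the original prioritized implementation:

import random
from collections import namedtuple

# Field names match how Agent.optimize_model unpacks the batch.
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class _Tree:
    # Stand-in for the sum-tree of a real prioritized replay memory.
    def __init__(self):
        self.n_entries = 0

class Memory:
    """Interface sketch: add(error, s, a, s_, r), sample(n) -> (batch, idxs, is_weights), update(idx, error)."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []
        self.tree = _Tree()

    def add(self, error, state, action, next_state, reward):
        # A real prioritized memory would store `error` as the transition's priority.
        self.data.append(Transition(state, action, next_state, reward))
        self.data = self.data[-self.capacity:]
        self.tree.n_entries = len(self.data)

    def sample(self, batch_size):
        # Uniform stand-in; the original samples proportionally to priority.
        idxs = random.sample(range(len(self.data)), batch_size)
        batch = [self.data[i] for i in idxs]
        is_weights = [1.0] * batch_size
        return batch, idxs, is_weights

    def update(self, idx, error):
        # The original updates the stored priority here; nothing to do in this sketch.
        pass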
Example #6
            if done:
                episode_score[-1] = score.get()
                score.reset()
                break

    mean_score = np.mean(episode_score)
    mean_reward = np.mean(episode_reward)
    n_episodes = len(episode_reward)

    return mean_score, mean_reward, n_episodes


score = LunarLanderScore()
reward = CustomReward() if use_custom_reward else None
env = arlie.make("LunarLander",
                 port=4000,
                 seed=seed,
                 render_mode=False,
                 reward=reward)
model = DQN.load("wave-trained-model")

print("Evaluating...")
mean_score, mean_reward, n_episodes = evaluate(env,
                                               model,
                                               score,
                                               num_episodes=eval_episodes)
print("Mean score: {}, reward: {}, in {} episodes".format(
    mean_score, mean_reward, n_episodes))

env.close()
Example #7
class Agent:
    """
    The intelligent agent of the simulation. It sets up the neural network model and general parameters,
    and is responsible for selecting actions, optimizing the neural network and managing the models.
    """

    def __init__(self, action_set, train=True, load_path=None):
        #1. Initialize agent params
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_set = action_set
        self.action_number = len(action_set)
        self.steps_done = 0
        self.epsilon = Config.EPS_START
        self.episode_durations = []

        print('LOAD PATH    --  agent.init:', load_path)
        time.sleep(2)

        #2. Build networks
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

        if not train:
            print('entered the no-training (evaluation) branch')
            self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)    
            self.policy_net.load(load_path, optimizer=self.optimizer)
            self.policy_net.eval()

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.memory = ReplayMemory(1000)

        


    def select_action(self, state, train=True):
        """
        Select the best action according to the Q-values output by the neural network

        Parameters
        ----------
            state: float ndarray
                The current state on the simulation
            train: bool
                Defines whether we are evaluating or training the model
        Returns
        -------
            a.max(1)[1]: int
                The action with the highest Q-value
            a.max(0): float
                The Q-value of the action taken
        """
        global steps_done
        sample = random.random()
        #1. Perform an epsilon-greedy algorithm
        #a. set the value for epsilon
        self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
            math.exp(-1. * self.steps_done / Config.EPS_DECAY)
            
        self.steps_done += 1

        #b. make the decision for selecting a random action or selecting an action from the neural network
        if sample > self.epsilon or (not train):
            # select an action from the neural network
            with torch.no_grad():
                # a <- argmax Q(s, theta)
                a = self.policy_net(state)
                return a.max(1)[1].view(1, 1), a.max(0)
        else:
            # select a random action
            print('random action')
            return torch.tensor([[random.randrange(self.action_number)]], device=self.device, dtype=torch.long), None

    def optimize_model(self):
        """
        Perform one step of optimization on the neural network
        """

        if len(self.memory) < Config.BATCH_SIZE:
            return
        transitions = self.memory.sample(Config.BATCH_SIZE)

        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              batch.next_state)), device=self.device, dtype=torch.uint8)
        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None])
        
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        
    
        # Compute argmax Q(s', a; θ)        
        next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1)

        # Compute Q(s', argmax Q(s', a; θ), θ-)
        next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, next_state_actions).squeeze(1).detach()

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch


        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def save(self, step, logs_path, label):
        """
        Save the model on hard disc

        Parameters
        ----------
            step: int
                current step on the simulation
            logs_path: string
                path to where we will store the model
            label: string
                label that will be used to store the model
        """

        os.makedirs(os.path.join(logs_path, label), exist_ok=True)

        full_label = label + str(step) + '.pth'
        logs_path = os.path.join(logs_path, label, full_label)

        self.policy_net.save(logs_path, step=step, optimizer=self.optimizer)
    
    def restore(self, logs_path):
        """
        Load the model from hard disc

        Parameters
        ----------
            logs_path: string
                path from which the model will be loaded
        """
        self.policy_net.load(logs_path)
        self.target_net.load(logs_path)
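
This variant stores transitions in a plain ReplayMemory(1000) rather than a prioritized memory. A minimal uniform buffer matching the calls made above (len() and sample()) could look like the sketch below; it is an assumption, not the project's own class:

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        # Oldest transitions are discarded once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, next_state, reward):
        self.buffer.append(Transition(state, action, next_state, reward))

    def sample(self, batch_size):
        # Uniform random sampling, as assumed by optimize_model above.
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)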
Example #8
class Agent:
    """
    Class representing a learning agent acting in an environment.
    """
    def __init__(self,
                 buffer_size,
                 batch_size,
                 alpha,
                 gamma,
                 epsilon,
                 epsilon_min,
                 epsilon_decay,
                 lr,
                 game="CartPole-v1",
                 mean_bound=5,
                 reward_bound=495.0,
                 sync_model=1000,
                 save_model=10):
        """
        Constructor of the agent class.
            - game="CartPole-v1" : Name of the game environment
            - mean_bound=5 : Number of last acquired rewards considered for mean reward
            - reward_bound=495.0 : Reward acquired for completing an episode properly
            - sync_model=1000 : Interval for synchronizing model and target model
            - save_model=10 : Interval for saving model

            - buffer_size : Replay buffer size of the DQN model
            - batch_size : Batch size of the DQN model
            - alpha : Learning rate for Q-Learning
            - gamma : Discount factor for Q-Learning
            - epsilon : Threshold for taking a random action
            - epsilon_min : Minimal value allowed for epsilon
            - epsilon_decay : Decay rate for epsilon
            - lr : Learning rate for the DQN model
        """

        # Environment variables
        self.game = game
        self.env = gym.make(self.game)
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n

        # Agent variables
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.mean_bound = mean_bound
        self.reward_bound = reward_bound

        # DQN variables
        self.lr = lr
        self.model = DQN(self.num_states, self.num_actions, self.lr)
        self.target_model = DQN(self.num_states, self.num_actions, self.lr)
        self.target_model.update(self.model)
        self.sync_model = sync_model
        self.save_model = save_model

        # File paths
        dirname = os.path.dirname(__file__)
        self.path_model = os.path.join(dirname, "../models/dqn.h5")
        self.path_plot = os.path.join(dirname, "../plots/dqn.png")

        # Load model, if it already exists
        try:
            self.model.load(self.path_model)
            self.target_model.update(self.model)
        except Exception:
            print("Model does not exist! Creating a new model...")

    def reduce_epsilon(self):
        """
        Reduces epsilon by the given decay factor, but never below the given minimum value.
        """

        epsilon = self.epsilon * self.epsilon_decay

        if epsilon >= self.epsilon_min:
            self.epsilon = epsilon
        else:
            self.epsilon = self.epsilon_min

    def get_action(self, state):
        """
        Returns an action for a given state, based on the current policy.
            - state : Current state of the agent
        """

        if np.random.random() < self.epsilon:
            action = self.env.action_space.sample()
        else:
            action = np.argmax(self.model.predict(state))

        return action

    def train(self, num_episodes, report_interval):
        """
        Trains the DQN model for a given number of episodes. Report output is controlled by the given episode interval.
            - num_episodes : Number of episodes to train
            - report_interval : Interval for outputting report information of training
        """

        step = 0
        total_rewards = []

        for episode in range(1, num_episodes + 1):
            if episode % self.save_model == 0:
                self.model.save(self.path_model)

            state = self.env.reset()
            state = state.reshape((1, self.num_states))
            total_reward = 0.0

            while True:
                step += 1

                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = next_state.reshape((1, self.num_states))

                # Penalize agent if pole could not be balanced until end of episode
                if done and reward < 499.0:
                    reward = -100.0

                self.buffer.remember(state, action, reward, next_state, done)
                self.replay()
                self.reduce_epsilon()

                state = next_state
                total_reward += reward

                if step % self.sync_model == 0:
                    self.target_model.update(self.model)

                if done:
                    total_reward += 100.0
                    total_rewards.append(total_reward)
                    mean_reward = np.mean(total_rewards[-self.mean_bound:])

                    if episode % report_interval == 0:
                        print(f"Episode: {episode}/{num_episodes}"
                              f"\tStep: {step}"
                              f"\tMemory Size: {len(self.memory)}"
                              f"\tEpsilon: {self.epsilon : .3f}"
                              f"\tReward: {total_reward}"
                              f"\tLast 5 Mean: {mean_reward : .2f}")

                        self.plot_rewards(total_rewards)

                    if mean_reward > self.reward_bound:
                        self.model.save(self.path_model)
                        return

                    break

        self.model.save(self.path_model)

    def replay(self):
        """
        Samples training data from the replay buffer and fits the DQN model.
        """

        sample_size, states, actions, rewards, next_states, dones = self.buffer.sample()

        q_values = self.model.predict(states)
        next_q_values = self.target_model.predict(next_states)

        for i in range(sample_size):
            action = actions[i]
            done = dones[i]

            if done:
                q_target = rewards[i]
            else:
                q_target = rewards[i] + self.gamma * np.max(next_q_values[i])

            q_values[i][action] = (1 - self.alpha) * \
                q_values[i][action] + self.alpha * q_target

        self.model.fit(states, q_values)

    def play(self, num_episodes):
        """
        Renders the trained agent for a given number of episodes.
            - num_episodes : Number of episodes to render
        """

        self.epsilon = self.epsilon_min

        for episode in range(1, num_episodes + 1):
            state = self.env.reset()
            state = state.reshape((1, self.num_states))
            total_reward = 0.0

            while True:
                self.env.render()
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = next_state.reshape((1, self.num_states))
                state = next_state
                total_reward += reward

                if done:
                    print(f"Episode: {episode}/{num_episodes}"
                          f"\tTotal Reward: {total_reward : .2f}")

                    break

    def plot_rewards(self, total_rewards):
        """
        Plots the rewards the agent has acquired during training.
            - total_rewards : Rewards the agent has gained per episode
        """

        x = np.arange(len(total_rewards))
        y = total_rewards

        slope, intercept, _, _, _ = linregress(x, y)

        plt.plot(x, y, linewidth=0.8)
        plt.plot(x, slope * x + intercept, color="red", linestyle="-.")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title("DQN-Learning")
        plt.savefig(self.path_plot)
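
The ReplayBuffer used by this agent is defined elsewhere; judging from the calls made above (remember(...) during training and an argument-free sample() unpacked in replay()), its interface is roughly the following sketch, written here under those assumptions:

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """Interface sketch inferred from the Agent class above; not the original implementation."""
    def __init__(self, buffer_size, batch_size):
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def remember(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self):
        # Returns the realized sample size plus per-field arrays, as unpacked in replay().
        sample_size = min(len(self.buffer), self.batch_size)
        batch = random.sample(list(self.buffer), sample_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (sample_size,
                np.concatenate(states),
                np.array(actions),
                np.array(rewards),
                np.concatenate(next_states),
                np.array(dones))

    def __len__(self):
        return len(self.buffer)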
Example #9
import numpy as np

from model import DQN
from rewards import CustomReward

wave = True
render_episodes = 7

if wave:
    import arlie

    env = arlie.make("LunarLander", reward=CustomReward())
else:
    import gym

    env = gym.make("LunarLander-v2")

model = DQN.load("{}-trained-model".format("wave" if wave else "gym"))

episode = render_episodes
reward_sum = 0
obs = np.reshape(env.reset(), (1, model.obs_size))
while episode > 0:
    action, _states = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    obs = np.reshape(obs, (1, model.obs_size))
    reward_sum += reward
    env.render()
    if done:
        print("Points: {}".format(reward_sum))
        episode -= 1
        reward_sum = 0
        obs = np.reshape(env.reset(), (1, model.obs_size))
Example #10
if __name__ == '__main__':
    # Create CartPole environment and network
    env = gym.make('CartPole-v0').unwrapped
    net = DQN(n_state=env.observation_space.shape[0],
              n_action=env.action_space.n,
              memory_size=memory_size,
              lr=lr,
              epsilon=epsilon,
              epsilon_decay=epsilon_decay,
              update_iter=update_iter,
              batch_size=batch_size,
              gamma=gamma,
              model_path=model_path)
    net.cuda()
    net.load()
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        while True:
            # env.render()
            # Select action and obtain the reward
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)

            # Record the total reward
            total_reward += r

            # Revise the reward
            if finish:
Example #11
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='path of input test weight')
    parser.add_argument('--rounds', type=int, default=3, help='play x rounds')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--test_epsilon', default=0, type=float)
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = gym.make('BreakoutNoFrameskip-v4')
    
    # frame stack and preprocessing
    env = AtariPreprocessing(env, noop_max=30, frame_skip=4)
    env = FrameStack(env, 4)

    model = DQN(env.observation_space.shape, env.action_space.n).to(device).eval()
    model.load(args.path, test=True)

    # play the requested number of rounds
    for i in range(args.rounds):
        done = False
        total_reward = 0
        state = env.reset()
        
        while not done:
            if args.render:
                env.render()
            action = model.select_action(state, args.test_epsilon, action_space)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            total_reward += reward
            if done: