import numpy as np
import torch

# Policy, Network and LunarLander are defined elsewhere in the project.


def eval(model_type=model_type, model_path=model_path):
    """Evaluate a trained model over 50 episodes and report win statistics.

    The defaults refer to module-level `model_type` and `model_path` settings.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    env = LunarLander()
    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []
    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0
        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1
            # Greedy action: argmax over the network's outputs, converted
            # to a plain int for the environment.
            with torch.no_grad():
                action = model(
                    torch.tensor(state, dtype=torch.float32,
                                 device=device).unsqueeze(0)).argmax().item()
            state, reward, done = env.step(action)
            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
    env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")
        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
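# A minimal entry point for running the evaluation above; 'dqn' and the
# checkpoint path are hypothetical placeholders, not values confirmed by
# the original project.
if __name__ == '__main__':
    eval(model_type='dqn', model_path='Model/dqn.pth')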
import pygame
import torch

# Demo loop: render the environment while the trained model flies the lander.
env = LunarLander()
env.reset()
exit_program = False
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

state = env.get_state()
while not exit_program:
    env.render()
    # Greedy action from the trained model.
    with torch.no_grad():
        action = model(
            torch.tensor(state, dtype=torch.float32,
                         device=device).unsqueeze(0)).argmax().item()
    state, reward, done = env.step(action)
    if done:
        # Restart so the demo keeps running after each landing attempt.
        env.reset()
        state = env.get_state()

    # Process game events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            exit_program = True
        if event.type == pygame.KEYDOWN:
            pass  # key handling truncated in the original listing

env.close()
import random

import gym
import torch
import torch.nn as nn

# Policy is defined elsewhere in the project.


class Learner:
    def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
        self.FILE = FILE
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy = Policy().to(self.device)
        self.policy.load_state_dict(torch.load(self.FILE))
        self.policy.eval()
        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=self.learning_rate)

    def simulate(self, episode: int, policyPercent: float, show=False):
        """
        Simulate the CartPole process.

        :param episode: number of episodes to simulate
        :param policyPercent: fraction of steps on which the policy, rather
            than a random action, chooses the move
        :return: list of ([trajectory of actions], [trajectory of
            observations], totalReward) tuples
        """
        env = gym.make('CartPole-v0')
        result = []
        for i_episode in range(episode):
            actions = []
            observations = []
            totalReward = 500  # maximum; overwritten if the pole falls early
            observation = env.reset()
            for t in range(500):
                if show:
                    env.render()
                observationTensor = torch.tensor(
                    observation, dtype=torch.float32).to(self.device)
                observations.append(observation.tolist())
                if random.random() <= policyPercent:
                    # Mix the policy's greedy action with random exploration.
                    with torch.no_grad():
                        action = torch.max(self.policy(observationTensor),
                                           0)[1].item()  # 0 or 1
                else:
                    action = random.randint(0, 1)
                actions.append(action)
                observation, reward, done, info = env.step(action)
                if done:
                    totalReward = t + 1
                    break
            result.append((actions, observations, totalReward))
        env.close()
        return result

    def trainPolicy(self, episodes, policyPercent=0.8):
        """Train the policy on trajectories that beat the average reward."""
        # First play several times to estimate the average reward.
        trajectoriesForAvgRwd = self.simulate(20, 1)
        averageReward = sum(i[2] for i in trajectoriesForAvgRwd) / len(
            trajectoriesForAvgRwd)
        print(averageReward)
        trajectoriesForTrain = self.simulate(episodes, policyPercent)
        for trainTrajectory in trajectoriesForTrain:
            if trainTrajectory[2] > averageReward:
                # Forward pass: predict actions for the whole trajectory.
                predictAction = self.policy(
                    torch.tensor(trainTrajectory[1]).to(self.device))
                loss = self.criterion(
                    predictAction,
                    torch.tensor(trainTrajectory[0]).to(self.device))
                # Backward pass and parameter update.
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        torch.save(self.policy.state_dict(), self.FILE)
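# A short usage sketch for the Learner class, assuming a pretrained
# checkpoint already exists at the default path; the episode counts below
# are illustrative, not taken from the original project.
if __name__ == '__main__':
    learner = Learner(learning_rate=0.01)
    learner.trainPolicy(episodes=100, policyPercent=0.8)
    # Watch a few episodes with the updated policy.
    learner.simulate(3, 1, show=True)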