def main():
    """Replay the saved LunarLander agent with on-screen rendering.

    Builds the ``LunarLander-v2`` environment, restores the policy weights
    from ``./models/agent.pt`` and steps the environment for 10000 frames,
    starting a new episode whenever the current one finishes.

    The environment is closed on every exit path (normal completion,
    exception or Ctrl-C) so the render window is always released.
    """
    env = gym.make("LunarLander-v2")
    try:
        agent = Agent(env)
        # map_location keeps a checkpoint that was saved from a GPU loadable
        # on a CPU-only host; load_state_dict then copies the values into the
        # agent's own parameters.
        state_dict = torch.load("./models/agent.pt", map_location="cpu")
        agent.load_state_dict(state_dict)
        agent.eval()

        obs = env.reset()
        # Pure inference: no gradients are needed, so skip autograd tracking.
        with torch.no_grad():
            for _frame in range(10000):
                env.render()
                obs = torch.from_numpy(obs).float()
                action, _, _ = agent.get_action(obs)
                obs, _reward, done, _info = env.step(action.cpu().numpy())
                # Short pause so the rendering is watchable.
                sleep(0.001)
                if done:
                    obs = env.reset()
    finally:
        env.close()
from gym import spaces
import cv2
# Disable OpenCV's OpenCL backend.
cv2.ocl.setUseOpenCL(False)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from model import Agent

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Expert policy: an Agent built for LunarLander-v2, restored from the saved
# checkpoint and switched to evaluation mode.
env = gym.make("LunarLander-v2")
expert = Agent(env)
expert.load_state_dict(torch.load("./models/agent.pt"))
expert.eval()


def generate_rollout(agent, env):
    """Roll `agent` out in `env`, choosing the arg-max action at every step.

    The agent is moved to the CPU, the environment is reset, and each
    observation is converted to a float tensor and recorded in `states`
    before the agent is queried for its action.

    Args:
        agent: policy whose `forward(obs)` returns action logits.
        env: environment whose `reset()` returns a NumPy observation.
    """
    # Observations are built with torch.from_numpy (CPU tensors), so the
    # agent is moved to the CPU to match.
    agent.to("cpu")
    # Per-step buffers for the rollout.
    states = []
    actions = []
    rewards = []
    obs = env.reset()
    steps = 0
    while True:
        obs = torch.from_numpy(obs).float()
        states.append(obs)
        logits = agent.forward(obs)
        # assumes logits is 1-D (single, unbatched observation) — TODO confirm
        probs = torch.softmax(logits, dim=0)
        # Greedy choice: softmax is monotonic, so this is the same index as
        # the arg-max of the raw logits.
        action = probs.argmax()
# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60 next_obs = envs.reset() next_done = torch.zeros(args.num_envs).to(device) num_updates = args.total_timesteps // args.batch_size ## CRASH AND RESUME LOGIC: starting_update = 1 if args.prod_mode and wandb.run.resumed: print("previous run.summary", run.summary) starting_update = run.summary['charts/update'] + 1 global_step = starting_update * args.batch_size api = wandb.Api() run = api.run(run.get_url()[len("https://app.wandb.ai/"):]) model = run.file('agent.pt') model.download(f"models/{experiment_name}/") agent.load_state_dict(torch.load(f"models/{experiment_name}/agent.pt")) agent.eval() print(f"resumed at update {starting_update}") for update in range(starting_update, num_updates + 1): # Annealing the rate if instructed to do so. if args.anneal_lr: frac = 1.0 - (update - 1.0) / num_updates lrnow = lr(frac) optimizer.param_groups[0]['lr'] = lrnow # TRY NOT TO MODIFY: prepare the execution of the game. for step in range(0, args.num_steps): envs.render() global_step += 1 * args.num_envs obs[step] = next_obs dones[step] = next_done