envs = [common.make_env() for _ in range(common.NUM_ENVS)]
test_env = common.make_env(test=True)
if args.seed:
    common.set_seed(args.seed, envs, cuda=args.cuda)
    suffix = "-seed=%d" % args.seed
else:
    suffix = ""
writer = SummaryWriter(comment="-03_i2a_" + args.name + suffix)

obs_shape = envs[0].observation_space.shape
act_n = envs[0].action_space.n

# small rollout policy: passed into the I2A model and trained with its own optimizer
net_policy = common.AtariA2C(obs_shape, act_n).to(device)

# environment model, loaded from pre-trained weights given by args.em
net_em = i2a.EnvironmentModel(obs_shape, act_n)
net_em.load_state_dict(torch.load(args.em, map_location=lambda storage, loc: storage))
net_em = net_em.to(device)

net_i2a = i2a.I2A(obs_shape, act_n, net_em, net_policy, ROLLOUTS_STEPS).to(device)
print(net_i2a)

# trial forward pass on a single observation
obs = envs[0].reset()
obs_v = ptan.agent.default_states_preprocessor([obs]).to(device)
res = net_i2a(obs_v)

# separate optimizers: one for the I2A agent, one for the rollout policy
optimizer = optim.RMSprop(net_i2a.parameters(), lr=LEARNING_RATE, eps=1e-5)
policy_opt = optim.Adam(net_policy.parameters(), lr=POLICY_LR)

step_idx = 0
total_steps = 0
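The two optimizers reflect the I2A training scheme: optimizer updates the imagination-augmented agent itself, while policy_opt trains the small rollout policy to imitate the agent's behaviour (distillation). The fragment below is only a minimal sketch of what such a distillation step could look like, not taken from the original listing; it assumes obs_v is a batch of observations and actions is the batch of actions the I2A agent chose for them, both produced by the training loop.

import torch
import torch.nn.functional as F

# Sketch only: cross-entropy distillation of the rollout policy towards the
# actions actually taken by the I2A agent on this batch.
logits_v, _ = net_policy(obs_v)      # AtariA2C returns (policy logits, value)
actions_v = torch.as_tensor(actions, dtype=torch.long, device=device)
distill_loss_v = F.cross_entropy(logits_v, actions_v)

policy_opt.zero_grad()
distill_loss_v.backward()
policy_opt.step()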
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    parser.add_argument("-m", "--model", required=True, help="File with model to load")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    saves_path = os.path.join("saves", "02_env_" + args.name)
    os.makedirs(saves_path, exist_ok=True)

    envs = [common.make_env() for _ in range(NUM_ENVS)]
    writer = SummaryWriter(comment="-02_env_" + args.name)

    # the pre-trained A2C agent (loaded from args.model) generates the experience;
    # net_em is the environment model being trained here
    net = common.AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n)
    net_em = i2a.EnvironmentModel(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
    net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage))
    net = net.to(device)
    print(net_em)

    optimizer = optim.Adam(net_em.parameters(), lr=LEARNING_RATE)
    step_idx = 0
    best_loss = np.inf

    with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
        for mb_obs, mb_obs_next, mb_actions, mb_rewards, done_rewards, done_steps in \
                iterate_batches(envs, net, device):
            if len(done_rewards) > 0:
                m_reward = np.mean(done_rewards)
                m_steps = np.mean(done_steps)
                print("%d: done %d episodes, mean reward=%.2f, steps=%.2f" % (
                    step_idx, len(done_rewards), m_reward, m_steps))
                tb_tracker.track("total_reward", m_reward, step_idx)
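The saves_path directory and the best_loss variable suggest that the script checkpoints the environment model whenever its training loss improves. The snippet below is only a hedged sketch of how such a check could look inside the batch loop, not the original code; loss_total (assumed to be the scalar environment-model loss for the current minibatch) and the file-name pattern are illustrative assumptions.

# Hypothetical continuation inside the batch loop: keep the checkpoint with
# the lowest environment-model loss observed so far.
loss_value = loss_total.item()
if loss_value < best_loss:
    print("Best loss updated: %.4e -> %.4e" % (best_loss, loss_value))
    best_loss = loss_value
    fname = os.path.join(saves_path, "best_%.4e_%05d.dat" % (loss_value, step_idx))
    torch.save(net_em.state_dict(), fname)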