os.makedirs(saves_path, exist_ok=True)

envs = [common.make_env() for _ in range(common.NUM_ENVS)]
test_env = common.make_env(test=True)
if args.seed:
    common.set_seed(args.seed, envs, cuda=args.cuda)
    suffix = "-seed=%d" % args.seed
else:
    suffix = ""
writer = SummaryWriter(comment="-03_i2a_" + args.name + suffix)

obs_shape = envs[0].observation_space.shape
act_n = envs[0].action_space.n

# rollout policy: selects actions inside the imagined rollouts
net_policy = common.AtariA2C(obs_shape, act_n).to(device)

# environment model is pre-trained separately and loaded from the --em file
net_em = i2a.EnvironmentModel(obs_shape, act_n)
net_em.load_state_dict(
    torch.load(args.em, map_location=lambda storage, loc: storage))
net_em = net_em.to(device)

# full I2A model: combines the model-free path with imagined rollouts
# of length ROLLOUTS_STEPS through the environment model
net_i2a = i2a.I2A(obs_shape, act_n, net_em, net_policy,
                  ROLLOUTS_STEPS).to(device)
print(net_i2a)

# sanity-check forward pass to verify the model is wired correctly
obs = envs[0].reset()
obs_v = ptan.agent.default_states_preprocessor([obs]).to(device)
res = net_i2a(obs_v)

optimizer = optim.RMSprop(net_i2a.parameters(),
                          lr=LEARNING_RATE, eps=1e-5)
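# Illustration (not in the original script): since I2A is built as a
# drop-in replacement for the A2C network, it is assumed here to return a
# (logits, values) pair like common.AtariA2C; under that assumption it can
# be wrapped in a ptan PolicyAgent to sample actions from the policy head:
agent = ptan.agent.PolicyAgent(lambda x: net_i2a(x)[0],
                               apply_softmax=True, device=device)
actions, _ = agent([obs])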
device = torch.device("cuda" if args.cuda else "cpu")

saves_path = os.path.join("saves", "01_a2c_" + args.name)
os.makedirs(saves_path, exist_ok=True)

envs = [common.make_env() for _ in range(common.NUM_ENVS)]
if args.seed:
    common.set_seed(args.seed, envs, cuda=args.cuda)
    suffix = "-seed=%d" % args.seed
else:
    suffix = ""
test_env = common.make_env(test=True)

writer = SummaryWriter(comment="-01_a2c_" + args.name + suffix)

net = common.AtariA2C(envs[0].observation_space.shape,
                      envs[0].action_space.n).to(device)
print(net)
optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE, eps=1e-5)

step_idx = 0
total_steps = 0
best_reward = None
ts_start = time.time()
best_test_reward = None
with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
    for (
        mb_obs, mb_rewards, mb_actions, mb_values, _,
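        # Illustration (not in the original script; the excerpt cuts off
        # inside the loop header above): the loop body computes the standard
        # A2C losses from each mini-batch. A minimal sketch, assuming obs_v,
        # actions_t and vals_ref_v are tensors built from mb_obs, mb_actions
        # and mb_rewards on `device`, F is torch.nn.functional, nn_utils is
        # torch.nn.utils, and ENTROPY_BETA / CLIP_GRAD are hyperparameters:
        #
        #     optimizer.zero_grad()
        #     logits_v, value_v = net(obs_v)
        #     loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)
        #     log_prob_v = F.log_softmax(logits_v, dim=1)
        #     adv_v = vals_ref_v - value_v.squeeze(-1).detach()
        #     log_p_a_v = log_prob_v[range(len(actions_t)), actions_t]
        #     loss_policy_v = -(adv_v * log_p_a_v).mean()
        #     prob_v = F.softmax(logits_v, dim=1)
        #     entropy_loss_v = ENTROPY_BETA * \
        #         (prob_v * log_prob_v).sum(dim=1).mean()
        #     loss_v = loss_policy_v + entropy_loss_v + loss_value_v
        #     loss_v.backward()
        #     nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
        #     optimizer.step()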
parser.add_argument("--seed", type=int, default=0, help="Random seed")
args = parser.parse_args()

device = torch.device("cuda" if args.cuda else "cpu")
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# same wrappers as in training, but with full episodes and unclipped rewards
make_env = lambda: ptan.common.wrappers.wrap_dqn(
    gym.make("BreakoutNoFrameskip-v4"),
    stack_frames=common.FRAMES_COUNT,
    episodic_life=False, reward_clipping=False)
env = make_env()
env = gym.wrappers.Monitor(env, args.write)

net = common.AtariA2C(env.observation_space.shape, env.action_space.n)
net.load_state_dict(
    torch.load(args.model, map_location=lambda storage, loc: storage))
if args.cuda:
    net.cuda()

act_selector = ptan.actions.ProbabilityActionSelector()

obs = env.reset()
total_reward = 0.0
total_steps = 0
while True:
    obs_v = ptan.agent.default_states_preprocessor([obs]).to(device)
    logits_v, values_v = net(obs_v)
    probs_v = F.softmax(logits_v, dim=1)
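    # Illustrative continuation (not in the excerpt): a minimal sketch of
    # the rest of the play loop, assuming the classic gym 4-tuple step API.
    # ptan's ProbabilityActionSelector samples actions from the batch of
    # probability vectors:
    probs = probs_v.data.cpu().numpy()
    actions = act_selector(probs)
    obs, reward, done, _ = env.step(actions[0])
    total_reward += reward
    total_steps += 1
    if done:
        break
print("Done in %d steps, reward %.2f" % (total_steps, total_reward))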