envs = [wrap_dqn(gym.make(args.env)) for _ in range(ENVS_COUNT)]
test_env = wrap_dqn(gym.make(args.env))

net_act = model.ModelActor(envs[0].observation_space.shape,
                           envs[0].action_space.n).to(device)
net_crt = model.ModelCritic(envs[0].observation_space.shape).to(device)
print(net_act)
print(net_crt)

writer = SummaryWriter(comment="-acktr_" + args.name)
agent = model.AgentA2C(net_act, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    envs, agent, GAMMA, steps_count=REWARD_STEPS)

# The actor is optimized with K-FAC (the core of ACKTR), while the
# critic keeps an ordinary first-order Adam optimizer.
opt_act = kfac.KFACOptimizer(net_act, lr=LEARNING_RATE_ACTOR)
opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

batch = []
best_reward = None
with ptan.common.utils.RewardTracker(writer) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
        for step_idx, exp in enumerate(exp_source):
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", np.mean(steps), step_idx)
                tracker.reward(np.mean(rewards), step_idx)

            if step_idx % TEST_ITERS == 0:
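The excerpt breaks off at the branch that periodically evaluates the policy on test_env. The part that makes this script ACKTR rather than plain A2C lives further down the same loop: before the usual policy-gradient step, the K-FAC optimizer periodically runs an extra backward pass with its acc_stats flag enabled to refresh its Kronecker-factored curvature (Fisher) estimates, and opt_act.step() then applies the natural-gradient preconditioning. The sketch below shows that update in isolation, under assumptions: the function name acktr_update is illustrative, the input tensors are what a batch-unpacking helper would produce, and the steps/Ts/acc_stats attributes are those of the widely used PyTorch KFACOptimizer implementation this kind of code is typically based on.

import torch
import torch.nn.functional as F

def acktr_update(states_v, actions_t, vals_ref_v,
                 net_act, net_crt, opt_act, opt_crt):
    """One ACKTR update: first-order critic step, K-FAC actor step (sketch)."""
    # Critic: plain MSE regression towards the n-step value targets.
    opt_crt.zero_grad()
    value_v = net_crt(states_v)
    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)
    loss_value_v.backward()
    opt_crt.step()

    logits_v = net_act(states_v)
    log_prob_v = F.log_softmax(logits_v, dim=1)

    # Every Ts steps, the optimizer expects an extra backward pass with
    # acc_stats=True so it can update its curvature statistics;
    # retain_graph=True keeps the graph alive for the real backward below.
    if opt_act.steps % opt_act.Ts == 0:
        opt_act.zero_grad()
        pg_fisher_loss = -log_prob_v.mean()
        opt_act.acc_stats = True
        pg_fisher_loss.backward(retain_graph=True)
        opt_act.acc_stats = False

    # Ordinary policy-gradient loss; opt_act.step() applies the
    # natural-gradient preconditioning internally.
    opt_act.zero_grad()
    adv_v = vals_ref_v - value_v.squeeze(-1).detach()
    lp_a_v = log_prob_v[torch.arange(len(actions_t)), actions_t]
    loss_policy_v = -(adv_v * lp_a_v).mean()
    loss_policy_v.backward()
    opt_act.step()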
parser.add_argument("--acktr", action='store_true',
                    help="Enable Acktr-specific tweaks")
args = parser.parse_args()

# ROS plumbing: the Gazebo link-state service and the robot's IMU topic,
# whose callback (defined elsewhere in the script) updates the globals.
get_link_state = rospy.ServiceProxy("/gazebo/get_link_state", GetLinkState)
pitch = 0
rospy.Subscriber('/Bobby/imu', Imu, get_angular_vel)
counter = 0

env = make_env(args)
if args.record:
    env = wrappers.Monitor(env, args.record)

net = model.ModelActor(env.observation_space.shape[0],
                       env.action_space.shape[0], args.hid)
# KFACOptimizer attaches hooks to the network's modules in its constructor,
# so for an ACKTR-trained model we create the optimizer before loading the
# weights, mirroring the training setup.
if args.acktr:
    opt = kfac.KFACOptimizer(net)
net.load_state_dict(torch.load(args.model))

obs = env.reset()
total_reward = 0.0
total_steps = 0
while True:
    obs_v = torch.FloatTensor(obs)
    mu_v = net(obs_v)
    action = mu_v.squeeze(dim=0).data.numpy()
    action = np.clip(action, -1, 1)
    if np.isscalar(action):
        action = [action]
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    total_steps += 1
    if done:
        break
print("Done in %d steps, total reward %.2f" % (total_steps, total_reward))
act_net = model.ModelActor(envs[0].observation_space.shape[0],
                           envs[0].action_space.shape[0]).to(device)
crt_net = model.ModelCritic(envs[0].observation_space.shape[0]).to(device)
print(act_net)
print(crt_net)

if args.act_model:
    act_net.load_state_dict(torch.load(args.act_model))
if args.crt_model:
    crt_net.load_state_dict(torch.load(args.crt_model))

writer = SummaryWriter(comment='-a2c_' + args.name)
agent = model.AgentA2C(act_net, device)
exp_source = drl.experience.ExperienceSourceFirstLast(
    envs, agent, gamma=GAMMA, steps_count=REWARD_STEP)

# KFACOptimizer takes the network itself (not .parameters()), since it
# needs the modules to attach its statistics hooks.
act_optimizer = kfac.KFACOptimizer(act_net, lr=LEARNING_RATE_ACTOR)
crt_optimizer = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE_CRITIC)

batch = []
best_reward = None
with drl.tracker.RewardTracker(writer) as tracker:
    with drl.tracker.TBMeanTracker(writer, 10) as tb_tracker:
        for step_idx, exp in enumerate(exp_source):
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", steps[0], step_idx)
                tracker.reward(rewards[0], step_idx)
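The best_reward variable prepares for the periodic evaluation that follows later in the loop but is not part of the excerpt. A minimal sketch of such a test helper, assuming the actor's mean output is used deterministically, as in the play loop shown earlier; the name test_net and its signature are illustrative, not taken from the source:

import numpy as np
import torch

def test_net(net, env, count=10, device="cpu"):
    """Run `count` deterministic episodes; return mean reward and steps."""
    rewards = 0.0
    steps = 0
    for _ in range(count):
        obs = env.reset()
        while True:
            obs_v = torch.FloatTensor([obs]).to(device)
            mu_v = net(obs_v)
            action = mu_v.squeeze(dim=0).data.cpu().numpy()
            action = np.clip(action, -1, 1)
            obs, reward, done, _ = env.step(action)
            rewards += reward
            steps += 1
            if done:
                break
    return rewards / count, steps / count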