agent.actor_perturbed = nn.DataParallel(agent.actor_perturbed)
agent.critic = nn.DataParallel(agent.critic)
agent.critic_target = nn.DataParallel(agent.critic_target)

agent.actor.to(device)
agent.actor_target.to(device)
agent.actor_perturbed.to(device)
agent.critic.to(device)
agent.critic_target.to(device)

# load pretrained actor/critic weights for this environment
end_str = "_{}_{}".format(args.env_name, args.model_suffix)
agent.load_model("models/ddpg_actor" + end_str, "models/ddpg_critic" + end_str)

# roll out the trained policy indefinitely, rendering every step
while True:
    episode_reward = 0
    state = torch.Tensor([env.reset()]).to(device)
    env.render()
    while True:
        action = agent.select_action(state, None, None)
        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
        env.render()
        episode_reward += reward

        # action = torch.Tensor(action).to(device)
        mask = torch.Tensor([not done]).to(device)
        next_state = torch.Tensor([next_state]).to(device)
        reward = torch.Tensor([reward]).to(device)

        state = next_state
        print("Reward: {}; Episode reward: {}".format(reward, episode_reward))
        if done:
            break
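# ---------------------------------------------------------------------------
# Hypothetical setup assumed by the evaluation fragment above: the fragment
# begins after args, device, env, and agent already exist. Everything below
# (the default env string, the agent constructor) is illustrative and not
# part of the original script; only env_name and model_suffix mirror the code.
# ---------------------------------------------------------------------------
import argparse

import gym
import torch
import torch.nn as nn

parser = argparse.ArgumentParser()
parser.add_argument("--env_name", type=str, default="Pendulum-v0")
parser.add_argument("--model_suffix", type=str, default="final")
args = parser.parse_args()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make(args.env_name)
# agent = DDPG(...)  # agent construction happens before the fragment above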
        # perturb the next action with Gaussian exploration noise
        next_action = next_action + npr.normal(0., eps, size=(action_dim,))

        # store the transition and, once enough data is collected, take a few
        # gradient steps on the learned dynamics model
        model_replay_buffer.push(state, action, reward, next_state, next_action, done)
        if len(model_replay_buffer) > batch_size:
            model_optim.update_model(batch_size, mini_iter=args.model_iter)

        state = next_state
        action = next_action
        episode_reward += reward
        frame_idx += 1

        if args.render:
            env.render("human")

        if frame_idx % (max_frames // 10) == 0:
            last_reward = rewards[-1][1] if len(rewards) > 0 else 0
            print('frame : {}/{}, \t last rew: {}'.format(
                frame_idx, max_frames, last_reward))
            # pickle.dump(rewards, open(path + 'reward_data' + '.pkl', 'wb'))
            # torch.save(policy_net.state_dict(), path + 'policy_' + str(frame_idx) + '.pt')
            # torch.save(model.state_dict(), path + 'model_' + str(frame_idx) + '.pt')

        if args.done_util:
            if done:
                break

    # end of episode: report training return and a separate evaluation rollout
    test_reward = evaluate(env, policy_net, args.max_steps)
    print('ep rew', ep_num, episode_reward, test_reward)
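# ---------------------------------------------------------------------------
# The loop above ends each episode with evaluate(env, policy_net, args.max_steps),
# which is not defined in this fragment. A minimal sketch of such a helper is
# given below, assuming policy_net maps a batched state tensor to an action
# tensor; the helper actually paired with this script may differ.
# ---------------------------------------------------------------------------
import torch


def evaluate(env, policy_net, max_steps, device="cpu"):
    """Roll out the policy once without exploration and return the episode reward."""
    state = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        with torch.no_grad():
            state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
            action = policy_net(state_t).cpu().numpy()[0]
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward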
frame_idx = 0
rewards = []
ep_num = 0

state = env.reset()
mpc_planner.reset()

episode_reward = 0
done = False
for step in range(max_steps):
    action = mpc_planner.update(state)
    for _ in range(frame_skip):
        state, reward, done, _ = env.step(action.copy())
        if done:
            break

    episode_reward += reward
    frame_idx += 1

    if args.render:
        env.render("rgb_array", width=320 * 2, height=240 * 2)

    if args.done_util:
        if done:
            break

print('ep rew', ep_num, episode_reward)
rewards.append([frame_idx, episode_reward])
ep_num += 1

env.close()
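# ---------------------------------------------------------------------------
# The rollout above touches mpc_planner only through reset() and
# update(state) -> action. A hypothetical stub with that interface is shown
# below (a random-action placeholder, not the planner actually used); it is
# only meant to document the assumed call signature.
# ---------------------------------------------------------------------------
class RandomPlanner:
    """Placeholder planner exposing the reset()/update(state) interface."""

    def __init__(self, action_space):
        self.action_space = action_space

    def reset(self):
        # a real planner would clear its warm-started action sequence here
        pass

    def update(self, state):
        # a real planner would optimize an action sequence against a dynamics
        # model and return the first action of that plan
        return self.action_space.sample()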
        # add exploration noise to the model-predicted next action
        next_action = next_action + np.random.normal(0., eps, size=(action_dim,))

        model_replay_buffer.push(state, action, reward, next_state, next_action, done)
        if len(model_replay_buffer) > batch_size:
            model_optim.update_model(batch_size, mini_iter=args.model_iter)

        state = next_state
        action = next_action
        episode_reward += reward
        frame_idx += 1

        if args.render:
            env.render('human')

        if frame_idx % (max_frames // 10) == 0:
            last_reward = rewards[-1][1] if len(rewards) > 0 else 0
            print('frame : {}/{}, \t last rew: {}'.format(
                frame_idx, max_frames, last_reward))
            if args.log:
                print('saving model and reward')
                pickle.dump(rewards, open(path + 'reward_data' + '.pkl', 'wb'))
                torch.save(model.state_dict(), path + 'model_' + str(frame_idx) + '.pt')

        if args.done_util:
            if done:
                break
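# ---------------------------------------------------------------------------
# model_replay_buffer is used above only through push(...) and len(...). A
# minimal ring-buffer sketch with that interface (plus a sample() method that
# model_optim.update_model would plausibly need) is given below; the buffer
# actually paired with this training loop may store transitions differently.
# ---------------------------------------------------------------------------
import random


class ModelReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, next_action, done):
        # overwrite the oldest transition once the buffer is full
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, next_action, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # return batch_size transitions, grouped column-wise
        batch = random.sample(self.buffer, batch_size)
        return tuple(map(list, zip(*batch)))

    def __len__(self):
        return len(self.buffer)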