print("Starting Ensemble!") imitator.train_ensemble() print("Finished Ensemble!") # create agent agent = Agent(env, imitator.policy.actor, args.device, running_state=running_state, render=args.render, num_threads=args.num_threads) log_list = {"bc_loss": [], "uncertainty_cost":[], "avg_reward": [], "std_reward": []} total_timesteps = 0 for i_iter in range(args.max_iter_num): batch, log = agent.collect_samples(args.min_batch_size) # train DRIL t0 = time.time() loss = imitator.train(batch) t1 = time.time() imitator.policy.actor.to('cpu') episode_rewards = evaluate_model(env, imitator.policy.actor, running_state=running_state, verbose=False)[ 'episodes_rewards'] imitator.policy.actor.to(args.device) if i_iter % args.log_interval == 0: print('{}\tT_update: {:.4f}\t training loss: {:.2f}\t uncertainty cost: {:.4f}' '\t R_avg: {:.2f}\t R_std: {:.2f}'.format( i_iter, t1 - t0, loss['bc_loss'], loss['uncertainty_cost'], episode_rewards.mean(), episode_rewards.std()))
# args.expert_traj_path = "assets/expert_traj/{}_{}_0.p".format(args.env_name, args.expert_model)
# expert_traj, running_state, _ = pickle.load(open(args.expert_traj_path, "rb"))
# running_state.fix = True

args.expert_path = "assets/expert_models/{}_ppo_0.p".format(args.env_name)

# load expert policy and state normalization, and freeze the normalizer's statistics
expert, _, running_state, _ = pickle.load(open(args.expert_path, "rb"))
running_state.fix = True

expert_agent = Agent(env, expert, args.device, running_state=running_state,
                     render=args.render, num_threads=args.num_threads)
expert_traj, expert_log = expert_agent.collect_samples(args.min_batch_size * args.num_trajs,
                                                       return_memory=True)
print(expert_log['avg_reward'])

# (optional) train a BC regressor on the expert trajectories
# regressor = BC(args, state_dim, action_dim, is_disc_action)
# regressor.set_expert2(expert_traj.sample())
# t0 = time.time()
# for _ in range(10):
#     regressor.train2()
# t1 = time.time()
# print("Finished training BC regressor, took {}".format(t1 - t0))
# running_state.fix = False
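
# `running_state` is a running mean/std filter pickled alongside the expert;
# setting `fix = True` freezes its statistics so expert rollouts are normalized
# exactly as they were during expert training. A minimal sketch of such a
# filter, assuming Welford-style online updates (the repo's actual ZFilter may
# differ in detail):
import numpy as np

class RunningNorm:
    def __init__(self, shape, clip=10.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)   # running sum of squared deviations
        self.clip = clip
        self.fix = False            # when True, statistics stop updating

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        if not self.fix:
            # Welford's online update of mean and variance
            self.n += 1
            delta = x - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n, 1))
        return np.clip((x - self.mean) / (std + 1e-8), -self.clip, self.clip)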
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=exp_args["config"]["lr"])
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=exp_args["config"]["lr"])

""" Create Agent """
agent = Agent(env, policy_net, device, running_state=running_state,
              render=exp_args["config"]["render"],
              num_threads=exp_args["config"]["num-threads"],
              horizon=exp_args["config"]["horizon"])
# collect an initial batch (this also warms up the running state normalizer)
agent.collect_samples(2048)


def update_params(batch):
    # stack the sampled transitions into tensors on the training device
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)

    # get generalized advantage estimates from the trajectories
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              exp_args["config"]["gamma"],
                                              exp_args["config"]["tau"],
# running_reward = ZFilter((1,), demean=False, clip=10)

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# env.seed(args.seed)

# checkpoints saved with a discriminator hold four objects, plain ones hold three
try:
    policy_net, value_net, discrim_net, running_state = pickle.load(open(args.file, "rb"))
    print('loaded policy, value, discriminator and running state')
except ValueError:
    policy_net, value_net, running_state = pickle.load(open(args.file, "rb"))
    print('loaded policy, value and running state')
print('type running state = ', type(running_state))

policy_net.to(device)
value_net.to(device)

"""create agent"""
agent = Agent(env, policy_net, device, running_state=running_state,
              num_threads=1, mean_action=True)

batch, log = agent.collect_samples(args.max_timesteps)
print('R_min {0:.2f}\tR_max {1:.2f}\tR_avg {2:.2f}\tNum_episodes {3}'.format(
    log['min_reward'], log['max_reward'], log['avg_reward'], log['num_episodes']))

env.shutdown()
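
# The try/except unpacking above works but couples loading to exception
# handling; an alternative sketch that dispatches on the checkpoint's tuple
# length instead, assuming the same two pickle layouts as above
# (`load_checkpoint` is an illustrative helper, not part of this repo):
import pickle

def load_checkpoint(path):
    with open(path, "rb") as f:
        payload = pickle.load(f)
    if len(payload) == 4:   # policy, value, discriminator, running_state
        policy_net, value_net, _discrim_net, running_state = payload
    else:                   # policy, value, running_state
        policy_net, value_net, running_state = payload
    return policy_net, value_net, running_state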