# imports assumed by this excerpt (the repo may pull SummaryWriter from
# tensorboardX instead of torch.utils.tensorboard)
import os
import pickle
from collections import deque

import gym
import numpy as np
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter


def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    # ZFilter keeps a running mean/std of observations and returns the
    # z-scored state, clipped to [-5, 5]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    # build the agent: policy (actor), value function (critic), and the GAIL
    # discriminator, which scores (state, action) pairs
    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load expert demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    # if not starting from scratch, resume from the given checkpoint
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        # restore the observation normalizer's running statistics as well
        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        # sample trajectories until a full batch of transitions is collected
        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)  # normalize the initial observation

            for _ in range(10000):  # step through the environment
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]  # sample an action from the policy
                next_state, reward, done, _ = env.step(action)
                # the learning signal comes from the discriminator,
                # not the environment reward
                irl_reward = get_reward(discrim, state, action)

                mask = 0 if done else 1
                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)  # normalize next state
                state = next_state
                score += reward  # track the true environment return for logging

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)  # log

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            # update the discriminator on learner vs. expert pairs
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            # once the discriminator separates expert from learner well enough,
            # freeze it and train only the policy
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False

        # PPO-style update of the actor and critic on the IRL rewards
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
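# `get_reward` above is defined elsewhere in the repo. A minimal sketch of the
# usual GAIL reward it computes, assuming the Discriminator outputs the
# probability that a (state, action) pair came from the learner (this body is
# an illustration, not the repo's exact implementation):
import torch

def get_reward_sketch(discrim, state, action):
    state_action = torch.cat([torch.Tensor(state), torch.Tensor(action)])
    with torch.no_grad():
        p = discrim(state_action.unsqueeze(0))
    # -log D(s, a): large when the pair looks expert-like to the discriminator
    return -torch.log(p + 1e-8).item()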
def main():
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5
    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0, 0)
    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo[0])
    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True
    # accuracy histories for the diagnostic plots below (these were used
    # without being initialized in the original)
    temp_learner = []
    temp_expert = []

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                # sample once, then discretize: the grid world takes one of 8
                # discrete actions (the original sampled twice, so the stored
                # action did not match the executed one)
                action = get_action(mu, std)[0]
                action2 = np.argmax(action)
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                mask = 0 if done else 1
                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp and
                 learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()
                plt.clf()  # reset the figure so successive saves don't stack

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'  # hardcoded
                ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
                print("check path", ckpt_path)

                save_checkpoint({
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                }, filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'  # hardcoded override
            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)

    # final accuracy curves over the whole run
    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
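# `save_checkpoint` is a thin persistence helper. A plausible minimal
# implementation, assuming it simply delegates to torch.save (the repo's
# version may add more bookkeeping):
import torch

def save_checkpoint_sketch(state, filename='checkpoint.pth.tar'):
    # `state` is the dict of state_dicts and ZFilter statistics built above
    torch.save(state, filename)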
def train(env_fn,
          seed=0,
          ppo_epoch=10,
          steps_per_epoch=2048,
          mini_batch_size=64,
          num_epoch=1500,
          gamma=0.99,
          clip_ratio=0.2,
          value_clip_ratio=10,
          value_loss_coef=0.5,
          entropy_loss_coef=0,
          use_value_clipped_loss=True,
          lr=3e-4,
          eps=1e-5,
          lam=0.95,
          max_grad_norm=0.5,
          max_ep_len=1000,
          save_freq=10,
          device=torch.device('cpu'),
          ac_kwargs=dict(),
          logger_kwargs=dict()):
    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    env.seed(seed)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    actor_critic = MLPActorCritic(env.observation_space, env.action_space,
                                  **ac_kwargs).to(device)
    ppo = PPO(actor_critic, clip_ratio, value_clip_ratio, ppo_epoch,
              mini_batch_size, value_loss_coef, entropy_loss_coef, lr, eps,
              max_grad_norm, use_value_clipped_loss)

    # Set up experience buffer
    buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam, device)

    # Set up model saving
    logger.setup_pytorch_saver(ppo.actor_critic)

    # Prepare for interaction with environment
    start_time = time.time()
    running_state = ZFilter((obs_dim[0],), clip=10)
    # running_reward = ZFilter((1,), demean=False, clip=10)
    obs, ep_ret, ep_len = env.reset(), 0, 0
    obs = running_state(obs)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(num_epoch):
        for t in range(steps_per_epoch):
            action, value, logp = ppo.actor_critic.step(
                torch.as_tensor(obs, dtype=torch.float32).to(device))

            next_obs, rew, done, _ = env.step(action)
            next_obs = running_state(next_obs)
            # rew = running_reward([rew])[0]
            ep_ret += rew
            ep_len += 1

            # save and log
            buf.store(obs, action, rew, value, logp)

            # Update obs (critical!)
            obs = next_obs

            timeout = ep_len == max_ep_len
            terminal = done or timeout
            epoch_ended = t == steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, value, _ = ppo.actor_critic.step(
                        torch.as_tensor(obs, dtype=torch.float32).to(device))
                else:
                    value = 0
                buf.finish_path(value)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                obs, ep_ret, ep_len = env.reset(), 0, 0
                obs = running_state(obs)

        # Save model
        if save_freq != 0 and ((epoch % save_freq == 0) or (epoch == num_epoch - 1)):
            logger.save_state({'env': env}, None)

        # perform update
        data = buf.get()
        policy_loss, value_loss, entropy, kl = ppo.update(data)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', policy_loss)
        logger.log_tabular('LossV', value_loss)
        logger.log_tabular('Entropy', entropy)
        logger.log_tabular('KL', kl)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
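# `buf.finish_path(value)` above closes out a trajectory. A minimal sketch of
# the GAE-lambda computation a PPOBuffer typically performs there (the array
# handling is an illustrative assumption, not the exact buffer internals):
import numpy as np

def finish_path_sketch(rewards, values, last_value, gamma=0.99, lam=0.95):
    # bootstrap the cut-off state with its value estimate
    rewards = np.append(np.asarray(rewards, dtype=np.float64), last_value)
    values = np.append(np.asarray(values, dtype=np.float64), last_value)
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards[:-1] + gamma * values[1:] - values[:-1]
    # advantages are the (gamma * lam)-discounted cumulative sum of the deltas
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    returns = advantages + values[:-1]  # value-function targets
    return advantages, returns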
def main():
    expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_1.p', "rb"))
    demonstrations = np.array(expert_demo[0])
    print("demonstrations.shape", demonstrations.shape)
    print(expert_demo[1])
    print(expert_demo[0])
    print(np.array(expert_demo[0]).shape)

    # expert_x = int(expert_demo[1][0])
    # expert_y = int(expert_demo[1][1])
    expert_x = int(expert_demo[0][0])
    expert_y = int(expert_demo[0][1])
    env = Env(expert_x, expert_y)
    # env.seed(args.seed)
    # torch.manual_seed(args.seed)

    num_inputs = 6
    num_actions = 8
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)

    # load demonstrations
    k = 1
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        # expert_demo = pickle.load(open('./paper/{}.p'.format((iter+1) % expert_sample_size), "rb"))
        print(iter)
        # resample a random expert episode each iteration
        expert_demo = pickle.load(
            open('./Expert dataset 1/expert_20x20_{}.p'.format(np.random.randint(1, 50)), "rb"))
        tmp = expert_demo.pop(-1)
        demonstrations = np.array(expert_demo)
        print(demonstrations, demonstrations.shape)
        tot_sample_size = len(demonstrations) + 10

        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        # while steps < args.total_sample_size:
        while steps < tot_sample_size:
            # env.delete_graph()
            state = env.reset()
            # time.sleep(1)
            score = 0
            # state = running_state(state)
            state1 = state

            for _ in range((tot_sample_size + 1) * 2):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                # sample once, then discretize for the 8-action grid world
                # (the original sampled twice, so the stored action did not
                # match the executed one)
                action = get_action(mu, std)[0]
                action2 = np.argmax(action)
                next_state, reward, done, _ = env.step(action2)
                irl_reward = get_reward(vdb, state, action)

                # for video recording:
                # if iter > 11500:
                #     time.sleep(0.015)

                mask = 0 if done else 1
                memory.append([state, action, irl_reward, mask])

                # next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            env.draw_graph()
            env.render()

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_vdb(vdb, memory, vdb_optim,
                                                demonstrations, 0, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'vdb': vdb.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)

        # always keep an up-to-date copy of the latest model
        score_avg = int(score_avg)
        model_path = os.path.join(os.getcwd(), 'save_model')
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        ckpt_path = os.path.join(model_path, 'ckpt_' + 'last_model' + '.pth.tar')

        save_checkpoint({
            'actor': actor.state_dict(),
            'critic': critic.state_dict(),
            'vdb': vdb.state_dict(),
            'z_filter_n': running_state.rs.n,
            'z_filter_m': running_state.rs.mean,
            'z_filter_s': running_state.rs.sum_square,
            'args': args,
            'score': score_avg
        }, filename=ckpt_path)
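# `train_vdb` optimizes the Variational Discriminator Bottleneck from VAIL.
# A minimal sketch of its loss, assuming vdb(x) returns (probability, mu,
# logvar) for an encoded (state, action) batch, that learner pairs are labeled
# 1 and expert pairs 0, and that beta is the dual variable for the information
# constraint (illustrative, not the repo's exact code):
import torch

def vdb_loss_sketch(vdb, expert_sa, learner_sa, beta, i_c=0.5):
    bce = torch.nn.BCELoss()
    p_exp, mu_e, logvar_e = vdb(expert_sa)
    p_lrn, mu_l, logvar_l = vdb(learner_sa)
    # standard discriminator terms
    d_loss = bce(p_lrn, torch.ones_like(p_lrn)) + bce(p_exp, torch.zeros_like(p_exp))
    # KL(encoder posterior || N(0, I)), averaged over expert and learner batches
    kl_e = 0.5 * torch.sum(mu_e ** 2 + logvar_e.exp() - logvar_e - 1, dim=1).mean()
    kl_l = 0.5 * torch.sum(mu_l ** 2 + logvar_l.exp() - logvar_l - 1, dim=1).mean()
    kl = 0.5 * (kl_e + kl_l)
    # penalize information flow above the budget i_c
    return d_loss + beta * (kl - i_c)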
new_raw = np.array(new_raw)
print(new_raw.shape)

# tile the 164 raw rows until there are 50000 samples of 14 features each
all_data = np.zeros((50000, 14), dtype=np.float64)
for i in range(50000):
    all_data[i][:] = new_raw[i % 164][:]
print(all_data.shape)
# all_data = np.reshape(all_data, (50000, 14))
# print(all_data.shape)
# new_data = np.reshape(new_data, (2, 2, 1))
# new_data = [draw[0][i], draw[1][i], draw[2][i]]

# push every sample through the running observation normalizer
running_state = ZFilter((14,), clip=5)
zfilter_data = np.zeros((50000, 14), dtype=np.float64)
a = np.array(all_data[-1][:], dtype=float)
for i in range(50000):
    zfilter_data[i][:] = running_state(all_data[i][:])
print('zfilter data =', all_data[-1][:], running_state(all_data[-1][:]))

# sanity-check against the pickled expert demonstrations (the `with` line was
# commented out in the original, leaving `f` undefined)
with open('expert_demo.p', 'rb') as f:
    data2 = pickle.load(f)
print(data2, '\n', np.array(data2)[0].shape)
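# ZFilter appears throughout these scripts. A minimal sketch of the standard
# running z-score filter it implements (Welford-style running statistics; the
# field names n / mean / sum_square match the checkpointing code above, the
# rest is a common-implementation assumption):
import numpy as np

class RunningStat:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)  # running sum of squared deviations

    def push(self, x):
        self.n += 1
        if self.n == 1:
            self.mean = x.copy()
        else:
            old_mean = self.mean.copy()
            self.mean = old_mean + (x - old_mean) / self.n
            self.sum_square = self.sum_square + (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.square(self.mean)
        return np.sqrt(var)

class ZFilter:
    """y = clip((x - running_mean) / running_std, -clip, clip)."""
    def __init__(self, shape, clip=10.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.rs.push(x)  # update statistics with the new observation
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)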
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                mask = 0 if done else 1
                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
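# `train_discrim` is shared by these GAIL scripts. A minimal sketch of a GAIL
# discriminator update with the accuracy bookkeeping used above (illustrative
# assumptions: learner pairs are labeled 1 and expert pairs 0, matching the
# -log D reward sketch earlier, and `args.discrim_update_num` is an assumed
# hyperparameter name; the repo's exact batching may differ):
import numpy as np
import torch

def train_discrim_sketch(discrim, memory, discrim_optim, demonstrations, args):
    states = torch.Tensor(np.vstack([m[0] for m in memory]))
    actions = torch.Tensor(np.vstack([m[1] for m in memory]))
    expert_sa = torch.Tensor(demonstrations)

    criterion = torch.nn.BCELoss()
    for _ in range(args.discrim_update_num):
        learner = discrim(torch.cat([states, actions], dim=1))
        expert = discrim(expert_sa)
        # learner pairs -> 1, expert pairs -> 0
        loss = criterion(learner, torch.ones_like(learner)) + \
               criterion(expert, torch.zeros_like(expert))
        discrim_optim.zero_grad()
        loss.backward()
        discrim_optim.step()

    # fraction of pairs each side classifies correctly
    expert_acc = (expert < 0.5).float().mean().item()
    learner_acc = (learner > 0.5).float().mean().item()
    return expert_acc, learner_acc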
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                mask = 0 if done else 1
                # plain PPO: store the environment reward directly
                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
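# the policy update inside `train_model` / `train_actor_critic` reduces to
# PPO's clipped surrogate objective; a minimal sketch (clip_param and the
# log-probability inputs are illustrative names, not the repo's exact
# signature):
import torch

def ppo_policy_loss_sketch(log_prob_new, log_prob_old, advantages, clip_param=0.2):
    ratio = torch.exp(log_prob_new - log_prob_old)  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # maximize the clipped surrogate, i.e. minimize its negation
    return -torch.min(surr1, surr2).mean()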
if __name__ == "__main__":
    env = Env(20, 20)
    # env.seed(500)
    torch.manual_seed(500)

    num_inputs = 2
    num_actions = 8

    print("state size: ", num_inputs)
    print("action size: ", num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    running_state = ZFilter((num_inputs,), clip=5)
    # running_state = ZFilter((100*100,), clip=5)
    # print(running_state)

    if args.load_model is not None:
        pretrained_model_path = os.path.join(os.getcwd(), 'save_model',
                                             str(args.load_model))
        pretrained_model = torch.load(pretrained_model_path)

        actor.load_state_dict(pretrained_model['actor'])
        critic.load_state_dict(pretrained_model['critic'])

        running_state.rs.n = pretrained_model['z_filter_n']
        running_state.rs.mean = pretrained_model['z_filter_m']
        running_state.rs.sum_square = pretrained_model['z_filter_s']
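# the block above only rebuilds the networks and normalizer; a minimal
# evaluation rollout to pair with it, reusing the get_action / np.argmax
# discretization convention from the training scripts (illustrative -- the
# repo's own test loop may differ):
    state = running_state(env.reset())
    done = False
    while not done:
        mu, std = actor(torch.Tensor(state).unsqueeze(0))
        action = np.argmax(get_action(mu, std)[0])  # pick one of the 8 grid actions
        state, reward, done, _ = env.step(action)
        state = running_state(state)
        env.render()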
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)

    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(vdb, state, action)

                mask = 0 if done else 1
                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train()

        train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
        train_ppo(actor, critic, memory, actor_optim, critic_optim, args)
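# `get_action`, used throughout these scripts, samples from the Gaussian
# policy head; a minimal sketch, assuming mu and std are the tensors returned
# by actor(state) (illustrative, not necessarily the repo's exact code):
import torch

def get_action_sketch(mu, std):
    action = torch.normal(mu, std)  # a ~ N(mu, std^2), one sample per row
    return action.detach().numpy()  # env.step and the replay memory expect numpy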