Example #1
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs, ), clip=5)  # running mean/std observation normalizer, clipped to [-5, 5]

    print('state size:', num_inputs)
    print('action size:', num_actions)

    # build the actor, critic, and discriminator networks
    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    # optionally resume from a saved checkpoint
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        # restore network weights
        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    # otherwise start training from scratch
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        # collect a batch of trajectories with the current policy
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            # sample trajectories  (batch size)
            state = env.reset()
            score = 0

            state = running_state(state)  # normalize the initial observation

            for _ in range(10000):
                #run through environment
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))  # policy outputs a Gaussian over actions
                action = get_action(mu, std)[0]  # sample an action from that Gaussian
                next_state, reward, done, _ = env.step(action)  # step the environment
                irl_reward = get_reward(discrim, state, action)  # surrogate reward from the discriminator

                if done:
                    mask = 0  # mask 0 marks the end of an episode
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)  # normalize the next observation
                state = next_state  # and make it the current state

                score += reward  #add total reward

                if done:
                    break
                # end of one environment episode

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)  # average true return over this batch
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)  # log to TensorBoard

        actor.train(), critic.train(), discrim.train()  # back to training mode
        if train_discrim_flag:  # keep training the discriminator until it is accurate enough
            # for training the discriminator
            expert_acc, learner_acc = train_discrim(
                discrim, memory, discrim_optim, demonstrations,
                args)  # see comments in train_model.
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False  # discriminator is accurate enough; train only the policy from now on
        #for training actor critic
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)  # no output, see comments in train_model

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)
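
Examples #1 through #9 all rely on ZFilter to normalize observations and save its rs.n / rs.mean / rs.sum_square statistics in their checkpoints, but the class itself is never shown. The following is a minimal sketch of what such a filter typically looks like, assuming a Welford-style running mean/variance; it is an illustration, not the repository's exact implementation.

import numpy as np

class RunningStat:
    # Welford-style running mean/variance; field names mirror the
    # rs.n / rs.mean / rs.sum_square attributes saved in the checkpoints above.
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        if self.n == 1:
            self.mean = x.copy()
        else:
            old_mean = self.mean.copy()
            self.mean = old_mean + (x - old_mean) / self.n
            self.sum_square = self.sum_square + (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.square(self.mean)
        return np.sqrt(var)

class ZFilter:
    # Normalize inputs to roughly zero mean / unit variance, then clip.
    def __init__(self, shape, clip=5.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x):
        self.rs.push(x)
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)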
Example #2
def main():
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5

    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0,0)

    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs, ), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))

    demonstrations = np.array(expert_demo[0])

    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True
    temp_learner = []  # discriminator accuracy history, plotted below
    temp_expert = []

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]  # sample once so the stored action matches the executed one
                action2 = np.argmax(action)  # discrete index actually sent to the environment
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))

            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp
                 and learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(model_path,
                                         'ckpt_' + str(score_avg) + '.pth.tar')

                print("check path", ckpt_path)
                save_checkpoint(
                    {
                        'actor': actor.state_dict(),
                        'critic': critic.state_dict(),
                        'discrim': discrim.state_dict(),
                        'z_filter_n': running_state.rs.n,
                        'z_filter_m': running_state.rs.mean,
                        'z_filter_s': running_state.rs.sum_square,
                        'args': args,
                        'score': score_avg
                    },
                    filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)
    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
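
Examples #1, #2, and #6 call train_discrim and gate further discriminator updates on the accuracies it returns, but the train_model module is not included. Below is a hedged sketch of one plausible single update, assuming the discriminator outputs the probability that a state-action pair came from the learner (learner labeled 1, expert labeled 0) and that demonstrations is an array of concatenated expert state-action pairs; names ending in _sketch are mine, not the repository's.

import torch
import torch.nn as nn

def train_discrim_sketch(discrim, memory, discrim_optim, demonstrations, args):
    # One binary-cross-entropy update on learner vs expert state-action pairs,
    # returning the accuracies used by the suspend_accu_* early-stop checks.
    states = torch.Tensor([m[0] for m in memory])
    actions = torch.Tensor([m[1] for m in memory])
    learner = discrim(torch.cat([states, actions], dim=1))
    expert = discrim(torch.Tensor(demonstrations))

    criterion = nn.BCELoss()
    loss = criterion(learner, torch.ones_like(learner)) + \
           criterion(expert, torch.zeros_like(expert))

    discrim_optim.zero_grad()
    loss.backward()
    discrim_optim.step()

    # Accuracy: fraction of each source classified as its own label.
    learner_acc = (learner > 0.5).float().mean().item()
    expert_acc = (expert < 0.5).float().mean().item()
    return expert_acc, learner_acc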
Example #3
def train(env_fn,
          seed=0,
          ppo_epoch=10,
          steps_per_epoch=2048,
          mini_batch_size=64,
          num_epoch=1500,
          gamma=0.99,
          clip_ratio=0.2,
          value_clip_ratio=10,
          value_loss_coef=0.5,
          entropy_loss_coef=0,
          use_value_clipped_loss=True,
          lr=3e-4,
          eps=1e-5,
          lam=0.95,
          max_grad_norm=0.5,
          max_ep_len=1000,
          save_freq=10,
          device=torch.device('cpu'),
          ac_kwargs=dict(),
          logger_kwargs=dict()):

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    env.seed(seed)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    actor_critic = MLPActorCritic(env.observation_space, env.action_space,
                                  **ac_kwargs).to(device)

    ppo = PPO(actor_critic, clip_ratio, value_clip_ratio, ppo_epoch,
              mini_batch_size, value_loss_coef, entropy_loss_coef, lr, eps,
              max_grad_norm, use_value_clipped_loss)

    # Set up experience buffer
    buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam, device)

    # Set up model saving
    logger.setup_pytorch_saver(ppo.actor_critic)

    # Prepare for interaction with environment
    start_time = time.time()
    running_state = ZFilter((obs_dim[0], ), clip=10)
    # running_reward = ZFilter((1,), demean=False, clip=10)
    obs, ep_ret, ep_len = env.reset(), 0, 0
    obs = running_state(obs)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(num_epoch):
        for t in range(steps_per_epoch):
            action, value, logp = ppo.actor_critic.step(
                torch.as_tensor(obs, dtype=torch.float32).to(device))

            next_obs, rew, done, _ = env.step(action)
            next_obs = running_state(next_obs)
            # rew = running_reward([rew])[0]
            ep_ret += rew
            ep_len += 1

            # save and log
            buf.store(obs, action, rew, value, logp)

            # Update obs (critical!)
            obs = next_obs

            timeout = ep_len == max_ep_len
            terminal = done or timeout
            epoch_ended = t == steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, value, _ = ppo.actor_critic.step(
                        torch.as_tensor(obs, dtype=torch.float32).to(device))
                else:
                    value = 0
                buf.finish_path(value)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    obs, ep_ret, ep_len = env.reset(), 0, 0
                    obs = running_state(obs)

        # Save model
        if save_freq != 0 and ((epoch % save_freq == 0) or
                               (epoch == num_epoch - 1)):
            logger.save_state({'env': env}, None)

        # perform update
        data = buf.get()
        policy_loss, value_loss, entropy, kl = ppo.update(data)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', policy_loss)
        logger.log_tabular('LossV', value_loss)
        logger.log_tabular('Entropy', entropy)
        logger.log_tabular('KL', kl)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
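
Example #3 defers advantage estimation to PPOBuffer.finish_path(value), bootstrapping with the critic's estimate when a trajectory is cut off by a timeout or the end of an epoch. The buffer is not shown; here is a minimal sketch, under the assumption that it computes standard GAE(lambda) advantages and discounted rewards-to-go per trajectory.

import numpy as np

def discount_cumsum(x, discount):
    # Reverse discounted cumulative sum over a 1-D array.
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def finish_path_sketch(rewards, values, last_value, gamma=0.99, lam=0.95):
    # Hypothetical equivalent of PPOBuffer.finish_path(value): GAE(lambda)
    # advantages and discounted returns for one trajectory, bootstrapped with
    # last_value (the critic's estimate, or 0 if the episode truly terminated).
    rews = np.append(rewards, last_value)
    vals = np.append(values, last_value)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    advantages = discount_cumsum(deltas, gamma * lam)
    returns = discount_cumsum(rews, gamma)[:-1]
    return advantages, returns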
Example #4
def main():
    expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_1.p', "rb"))
    demonstrations = np.array(expert_demo[0])

    print("demonstrations.shape", demonstrations.shape)

    print(expert_demo[1])
    print(expert_demo[0])
    print(np.array(expert_demo[0]).shape)

    # expert_x = int(expert_demo[1][0])
    # expert_y = int(expert_demo[1][1])

    expert_x = int(expert_demo[0][0])
    expert_y = int(expert_demo[0][1])


    env = Env(expert_x, expert_y)

    # env.seed(args.seed)
    # torch.manual_seed(args.seed)

    num_inputs = 6
    num_actions = 8
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)
    
    # load demonstrations

    k = 1
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True



    for iter in range(args.max_iter_num):
        # expert_demo = pickle.load(open('./paper/{}.p'.format((iter+1)%expert_sample_size), "rb"))
        print(iter)
        expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_{}.p'.format(np.random.randint(1,50)), "rb"))
        tmp = expert_demo.pop(-1)

        demonstrations = np.array(expert_demo)

        print(demonstrations, demonstrations.shape)
        tot_sample_size = len(demonstrations) + 10
        ##########################

        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        # while steps < args.total_sample_size:

        while steps < tot_sample_size:
            # env.delete_graph()
            state = env.reset()
            # time.sleep(1)

            score = 0

            # state = running_state(state)
            state1 = state
            for _ in range((tot_sample_size+1)*2):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]  # sample once so the stored action matches the executed one
                action2 = np.argmax(action)  # discrete index actually sent to the environment
                next_state, reward, done, _ = env.step(action2)

                irl_reward = get_reward(vdb, state, action)

                # ###### for video recording
                # if iter > 11500:
                #     time.sleep(0.015)
                # #####
                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                # next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            ##########################
            env.draw_graph()
            env.render()
            ##########################
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'vdb': vdb.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)

    ####
    score_avg = int(score_avg)

    model_path = os.path.join(os.getcwd(), 'save_model')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    ckpt_path = os.path.join(model_path, 'ckpt_' + 'last_model' + '.pth.tar')

    save_checkpoint({
        'actor': actor.state_dict(),
        'critic': critic.state_dict(),
        'vdb': vdb.state_dict(),
        'z_filter_n': running_state.rs.n,
        'z_filter_m': running_state.rs.mean,
        'z_filter_s': running_state.rs.sum_square,
        'args': args,
        'score': score_avg
    }, filename=ckpt_path)
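
All of the examples sample actions through get_action(mu, std), and the GAIL/VAIL variants score them with get_reward(...); neither helper is shown. A minimal sketch follows, assuming a diagonal-Gaussian policy and the learner-labeled-1 discriminator convention from the sketch after Example #2, under which -log D(s, a) is larger for more expert-looking pairs.

import math
import numpy as np
import torch

def get_action_sketch(mu, std):
    # Sample from the diagonal Gaussian defined by the actor's output;
    # the callers above index [0] to drop the batch dimension.
    action = torch.normal(mu, std)
    return action.data.numpy()

def get_reward_sketch(discrim, state, action):
    # Surrogate IRL reward from the discriminator (learner = 1, expert = 0),
    # so pairs the discriminator thinks are expert-like get a larger reward.
    state_action = torch.Tensor(np.concatenate([np.asarray(state), np.asarray(action)]))
    with torch.no_grad():
        return -math.log(float(discrim(state_action)) + 1e-8)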
Example #5
new_raw = np.array(new_raw)
print(new_raw.shape)

all_data = np.zeros((50000, 14), dtype=np.float64)

for i in range(50000):
    all_data[i][:] = new_raw[i%164][:]

print(all_data.shape)
# all_data = np.reshape(all_data, 50000, 14)
# print(all_data.shape)
# new_data = np.reshape(new_data, (2, 2, 1))

# new_data = [draw[0][i], draw[1][i], draw[2][i]]
running_state = ZFilter((14,), clip=5)

zfilter_data = np.zeros((50000, 14), dtype=np.float64)
a = np.array(all_data[-1][:],dtype=float)

for i in range(50000):
    zfilter_data[i][:] = running_state(all_data[i][:])
print('zfilter data = ', all_data[-1][:], running_state(all_data[-1][:]))



#
with open('expert_demo.p','rb') as f:
    data2 = pickle.load(f)
print(data2,'\n',np.array(data2)[0].shape)
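
If the z-filtered array is intended to feed the training scripts above, it would need to be written back in the same pickle format they load; a hypothetical one-liner (output file name is an assumption):

with open('expert_demo_zfiltered.p', 'wb') as f:
    pickle.dump(zfilter_data, f)  # hypothetical output, matching the pickle.load calls elsewhere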
Example #6
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)
    
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
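
save_checkpoint appears in most examples with a state dict and a filename but is never defined; in scripts of this style it is usually a thin wrapper over torch.save, sketched below.

import torch

def save_checkpoint_sketch(state, filename='checkpoint.pth.tar'):
    # Persist the dict of network weights, ZFilter statistics, args, and score
    # assembled by the training loops above; torch.load restores it later.
    torch.save(state, filename)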
Example #7
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
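
Example #7 delegates the actual policy update to train_model, and Examples #1, #2, and #4 to train_actor_critic; neither is shown here. As a reference point, a hedged sketch of the clipped PPO surrogate such a routine typically minimizes for the policy (name and signature are assumptions):

import torch

def ppo_policy_loss_sketch(log_prob_new, log_prob_old, advantages, clip_ratio=0.2):
    # Clipped PPO objective: bound how far the updated policy can move from the
    # one that collected the batch, then take the pessimistic (min) surrogate.
    ratio = torch.exp(log_prob_new - log_prob_old)
    clipped = torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    return -torch.min(ratio * advantages, clipped * advantages).mean()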
Example #8
if __name__ == "__main__":

    env = Env(20, 20)
    # env.seed(500)
    torch.manual_seed(500)

    num_inputs = 2
    num_actions = 8

    print("state size: ", num_inputs)
    print("action size: ", num_actions)

    actor = Actor(num_inputs, num_actions,args)
    critic = Critic(num_inputs,args)

    running_state = ZFilter((num_inputs,), clip=5)
    # running_state = ZFilter((100*100,), clip=5)

    # print(running_state)

    if args.load_model is not None:
        pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))

        pretrained_model = torch.load(pretrained_model_path)

        actor.load_state_dict(pretrained_model['actor'])
        critic.load_state_dict(pretrained_model['critic'])

        running_state.rs.n = pretrained_model['z_filter_n']
        running_state.rs.mean = pretrained_model['z_filter_m']
        running_state.rs.sum_square = pretrained_model['z_filter_s']
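
Example #8 stops after restoring the networks and the ZFilter statistics. A hypothetical evaluation rollout with the restored pieces might look like this (greedy argmax over the sampled action vector, as in Example #2):

state = running_state(env.reset())
done, total_reward = False, 0.0
while not done:
    mu, std = actor(torch.Tensor(state).unsqueeze(0))
    action = np.argmax(get_action(mu, std)[0])  # discrete pick, as in Example #2
    state, reward, done, _ = env.step(action)
    state = running_state(state)
    total_reward += reward
print('evaluation return:', total_reward)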
Example #9
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(vdb, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train() 
        train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
        train_ppo(actor, critic, memory, actor_optim, critic_optim, args)
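
Example #9 (and Example #4) swap the plain discriminator for a VDB and call train_vdb, which is not included here. Below is a hedged sketch of a VAIL-style discriminator objective, assuming the VDB's forward pass returns (probability, mu, logvar) from its latent encoder; the repository's actual train_vdb may differ.

import torch
import torch.nn as nn

def vdb_discrim_loss_sketch(vdb, learner_sa, expert_sa, beta, i_c=0.5):
    # BCE on learner (label 1) vs expert (label 0) state-action pairs, plus a
    # KL bottleneck on the encoder latent weighted by the dual variable beta.
    bce = nn.BCELoss()
    p_l, mu_l, logvar_l = vdb(learner_sa)  # assumed 3-tuple output
    p_e, mu_e, logvar_e = vdb(expert_sa)

    disc_loss = bce(p_l, torch.ones_like(p_l)) + bce(p_e, torch.zeros_like(p_e))

    def kl_to_unit_gaussian(mu, logvar):
        # KL( N(mu, exp(logvar)) || N(0, I) ), averaged over the batch.
        return 0.5 * torch.sum(mu.pow(2) + logvar.exp() - logvar - 1, dim=1).mean()

    bottleneck = 0.5 * (kl_to_unit_gaussian(mu_l, logvar_l) +
                        kl_to_unit_gaussian(mu_e, logvar_e)) - i_c
    # beta itself would be updated by dual ascent on the bottleneck constraint.
    return disc_loss + beta * bottleneck, bottleneck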