Code example #1
File: rl_main.py  Project: voot-t/vild_code
def main(args):
    if use_gpu:
        torch.backends.cudnn.deterministic = True
        print(colored("Using CUDA.", p_color))
        torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    test_cpu = False      # True to avoid moving gym's state to gpu tensor every step during testing.

    """ Create environment and get environment's info. """
    if args.env_atari:
        from my_utils.atari_wrappers import Task 
        env = Task(args.env_name, num_envs=1, clip_rewards=False, seed=args.seed)     
        env_test = Task(args.env_name, num_envs=1, clip_rewards=False, seed=args.seed)     
    elif args.env_bullet:
        import pybullet 
        import pybullet_envs 
        pybullet.connect(pybullet.DIRECT)
        env = gym.make(args.env_name)
        env.seed(args.seed)  
        env_test = env        
        if args.render:
            env_test.render(mode="human")
    else: 
        env = gym.make(args.env_name)     
        env_test = gym.make(args.env_name)
        env.seed(args.seed)  
        env_test.seed(args.seed)  

    state_dim = env.observation_space.shape[0]
    is_disc_action = args.env_discrete
    action_dim = (0 if is_disc_action else env.action_space.shape[0])
    if is_disc_action:
        a_bound = 1
        action_num = env.action_space.n 
        print("State dim: %d, action num: %d" % (state_dim, action_num))
    else:
        """ always normalize env. """ 
        from my_utils.my_gym_utils import NormalizeGymWrapper
        env = NormalizeGymWrapper(env)
        env_test = NormalizeGymWrapper(env_test)
        a_bound = np.asscalar(env.action_space.high[0])
        a_low = np.asscalar(env.action_space.low[0])
        assert a_bound == -a_low 
        print("State dim: %d, action dim: %d, action bound %d" % (state_dim, action_dim, a_bound))

    """ Set method and hyper parameter in file name"""
    method_name = args.rl_method.upper()
    hypers = rl_hypers_parser(args)     
    exp_name = "%s-%s_s%d" % (method_name, hypers, args.seed)

    """ Set path for result and model files """
    result_path = "./RL_results/%s/%s/%s-%s" % (method_name, args.env_name, args.env_name, exp_name)
    model_path = "./RL_results/%s_models/%s/%s-%s" % (args.rl_method.upper(), args.env_name, args.env_name, exp_name) 
    pathlib.Path("./RL_results/%s/%s" % (method_name, args.env_name)).mkdir(parents=True, exist_ok=True) 
    if platform.system() != "Windows":
        pathlib.Path("./RL_results/%s_models/%s" % (method_name, args.env_name)).mkdir(parents=True, exist_ok=True) 
    print("Running %s" % (colored(method_name, p_color)))
    print("%s result will be saved at %s" % (colored(method_name, p_color), colored(result_path, p_color)))

    """define actor and critic"""
    if is_disc_action:
        if args.rl_method == "dqn":
            policy_updater = DQN(state_dim=state_dim, action_num=action_num, args=args, double_q=False, cnn=args.cnn)
        if args.rl_method == "ddqn":
            policy_updater = DQN(state_dim=state_dim, action_num=action_num, args=args, double_q=True, cnn=args.cnn)
        if args.rl_method == "qr_dqn":
            policy_updater = QR_DQN(state_dim=state_dim, action_num=action_num, args=args, cnn=args.cnn)
        if args.rl_method == "clipped_ddqn":
            policy_updater = Clipped_DDQN(state_dim=state_dim, action_num=action_num, args=args, cnn=args.cnn)
        if args.rl_method == "ppo":
            policy_updater = PPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=action_num, is_discrete=True, cnn=args.cnn)
    else:
        if args.rl_method == "ac":
            policy_updater = AC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
        if args.rl_method == "sac":
            policy_updater = SAC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
        if args.rl_method == "td3":
            policy_updater = TD3(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
        if args.rl_method == "trpo":
            policy_updater = TRPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
        if args.rl_method == "ppo":
            policy_updater = PPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
    update_type = policy_updater.update_type  # "on_policy" or "off_policy"
    if args.max_step is None:
        if update_type == "on_policy":
            args.max_step = 5000000
        elif update_type == "off_policy":
            args.max_step = 1000000
        if args.env_atari:
            args.max_step = args.max_step * 10 
        
    """ Function to update the parameters of value and policy networks"""
    def update_params_g(batch):
        states = torch.FloatTensor(np.stack(batch.state)).to(device)
        next_states = torch.FloatTensor(np.stack(batch.next_state)).to(device)
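        # mask = int(not done): a zero mask marks the end of an episode so the update does not bootstrap across the boundary.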
        masks = torch.FloatTensor(np.stack(batch.mask)).to(device).unsqueeze(-1)
        rewards = torch.FloatTensor(np.stack(batch.reward)).to(device).unsqueeze(-1)
        actions = torch.LongTensor(np.stack(batch.action)) if is_disc_action else torch.FloatTensor(np.stack(batch.action))

        policy_updater.update_policy(states, actions.to(device), next_states, rewards, masks)
    
    """ Storage and counters """
    memory = Memory(capacity=1000000)   # Memory buffer with 1 million max size.
    step, i_iter, tt_g = 0, 0, 0
    perform_test = 0    
    log_interval = args.max_step // 1000     # 1000 lines in the text files
    save_model_interval = (log_interval * 10) * (platform.system() != "Windows")  # do not save on my windows laptop
    print("Max steps: %s, Log interval: %s steps, Model interval: %s steps" % \
         (colored(args.max_step, p_color), colored(log_interval, p_color), colored(save_model_interval, p_color)))

    """ Reset seed again """  
    if use_gpu:
        torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    """ Agent for testing in a separated environemnt """
    agent_test = Agent(env_test, render=args.render, t_max=args.t_max, test_cpu=test_cpu)
    if args.env_bullet: 
        log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)

    state = env.reset()
    done = 1    # mark the fresh reset so the pybullet test branch below does not read an undefined variable
    """ The actual learning loop"""
    for total_step in range(0, args.max_step + 1):

        """ Save the learned policy model """
        if save_model_interval > 0 and total_step % save_model_interval == 0: 
            policy_updater.save_model("%s_policy_T%d.pt" % (model_path, total_step))

        """ Test the policy before update """
        if total_step % log_interval == 0:
            perform_test = 1
         
        if perform_test:
            if args.env_bullet: 
                if done:
                    log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
                    perform_test = 0
            else:
                log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
                perform_test = 0

        """ take env step """
        if total_step <= args.random_action and update_type == "off_policy":
            action = env.action_space.sample()
        else:
            action = policy_updater.sample_action(torch.FloatTensor(state).to(device).unsqueeze(0)).to(device_cpu).detach().numpy()

        next_state, reward, done, _ = env.step(action)

        if step + 1 == args.t_max:
            done = 1
        memory.push(state, action, int(not done), next_state, reward, 0)
        state = next_state
        step = step + 1        

        """ reset env """
        if done :  # reset
            state = env.reset()
            step = 0
                          
        """ Update policy """
        if update_type == "on_policy":
            if memory.size() >= args.big_batch_size and done :
                t0_g = time.time()
                batch = memory.sample()
                update_params_g(batch=batch) 
                memory.reset() 
                tt_g += time.time() - t0_g

        elif update_type == "off_policy":
            if total_step >= args.big_batch_size:         
                t0_g = time.time()
                batch = memory.sample(args.mini_batch_size)     
                update_params_g(batch=batch)  
                tt_g += time.time() - t0_g
                     
        """ Print out result to stdout and save it to a text file for plotting """
        if total_step % log_interval == 0:
        
            result_text = t_format("Step %7d " % (total_step), 0) \
                        + t_format("(g%2.2f)s" % (tt_g), 1) 
            result_text += " | [R_te] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                            + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            
            if (args.rl_method == "sac" or args.rl_method == "vac"):
                result_text += ("| ent %0.3f" % (policy_updater.entropy_coef))

            tt_g = 0
            print(result_text)
            with open(result_path + ".txt", 'a') as f:
                print(result_text, file=f) 
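The loop above pushes transitions into a Memory buffer whose implementation is not shown on this page. As a rough guide, a minimal buffer compatible with the calls used here (push, sample, size, reset, and the batch.state / batch.action / batch.mask / batch.reward fields) might look like the sketch below; Transition and MemorySketch are illustrative names, not the project's actual classes.

import random
from collections import deque, namedtuple

# Field order mirrors the memory.push(state, action, int(not done), next_state, reward, latent_code) calls above.
Transition = namedtuple("Transition", ("state", "action", "mask", "next_state", "reward", "latent_code"))

class MemorySketch:
    def __init__(self, capacity=1000000):
        self.buffer = deque(maxlen=capacity)

    def push(self, *transition):
        self.buffer.append(Transition(*transition))

    def sample(self, batch_size=None):
        # On-policy updates sample the whole buffer; off-policy updates draw a random mini-batch.
        pool = list(self.buffer)
        if batch_size is not None:
            pool = random.sample(pool, batch_size)
        return Transition(*zip(*pool))  # batch.state, batch.action, ... are tuples ready for np.stack

    def size(self):
        return len(self.buffer)

    def reset(self):
        self.buffer.clear()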
Code example #2
def main(args):

    if args.il_method is None:
        method_type = "RL"  # means we just do RL with environment's rewards 
        info_method = False 
        encode_dim = 0 
    else:
        method_type = "IL"
        if "info" in args.il_method:
            info_method = True
            encode_dim = args.encode_dim
        else:
            info_method = False 
            encode_dim = 0 

    torch.manual_seed(args.seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.deterministic = True
        print(colored("Using CUDA.", p_color))
    np.random.seed(args.seed)
    random.seed(args.seed)
    test_cpu = True      # Set to True to avoid moving gym's state to gpu tensor every step during testing.

    env_name = args.env_name 
    """ Create environment and get environment's info. """
    if args.env_atari:
        from my_utils.atari_wrappers import Task 
        env = Task(env_name, num_envs=1, clip_rewards=False, seed=args.seed)     
        env_test = Task(env_name, num_envs=1, clip_rewards=False, seed=args.seed)     
    elif args.env_bullet:
        import pybullet 
        import pybullet_envs 
        pybullet.connect(pybullet.DIRECT)
        env = gym.make(env_name)
        env.seed(args.seed)  
        env_test = env        
        if args.render:
            env_test.render(mode="human")
    elif args.env_robosuite:
        from my_utils.my_robosuite_utils import make_robosuite_env
        args.t_max = 500 
        env = make_robosuite_env(args)
        env_test = make_robosuite_env(args)
        # the robosuite sampler uses functions from Python's random module, so the seeds are already set.
        env_name = args.env_name + "_reach"

    else: 
        env = gym.make(env_name)     
        env.seed(args.seed)  
        env_test = gym.make(env_name)
        env_test.seed(args.seed)  

    state_dim = env.observation_space.shape[0]
    is_disc_action = args.env_discrete
    action_dim = (0 if is_disc_action else env.action_space.shape[0])
    if args.env_robosuite:
        action_dim = action_dim - 1     # we disable gripper for reaching 
    if is_disc_action:
        a_bound = 1
        action_num = env.action_space.n 
        print("State dim: %d, action num: %d" % (state_dim, action_num))
    else:
        """ always normalize env. """ 
        if np.asscalar(env.action_space.high[0]) != 1:
            from my_utils.my_gym_utils import NormalizeGymWrapper
            env = NormalizeGymWrapper(env)
            env_test = NormalizeGymWrapper(env_test)
            print("Use state-normalized environments.")
        a_bound = np.asscalar(env.action_space.high[0])
        a_low = np.asscalar(env.action_space.low[0])
        assert a_bound == -a_low 
        assert a_bound == 1 
        print("State dim: %d, action dim: %d, action bound %d" % (state_dim, action_dim, a_bound))

        if "LunarLanderContinuous" in env_name or "BipedalWalker" in env_name:
            from my_utils.my_gym_utils import ClipGymWrapper
            env = ClipGymWrapper(env) 
            env_test = ClipGymWrapper(env_test) 

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    """define actor and critic"""
    if is_disc_action:  # work in progress...
        if args.rl_method == "dqn":
            policy_updater = DQN(state_dim=state_dim, action_num=action_num, args=args, double_q=False, encode_dim=encode_dim)
        if args.rl_method == "ddqn":
            policy_updater = DQN(state_dim=state_dim, action_num=action_num, args=args, double_q=True, encode_dim=encode_dim)
        if args.rl_method == "qr_dqn":
            policy_updater = QR_DQN(state_dim=state_dim, action_num=action_num, args=args, encode_dim=encode_dim)
        if args.rl_method == "clipped_ddqn":
            policy_updater = Clipped_DDQN(state_dim=state_dim, action_num=action_num, args=args, encode_dim=encode_dim)
        if args.rl_method == "ppo":
            policy_updater = PPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=action_num, is_discrete=True, encode_dim=encode_dim)
    else:
        if args.rl_method == "ac":
            policy_updater = AC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)
        if args.rl_method == "sac":
            policy_updater = SAC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)
        if args.rl_method == "td3":
            policy_updater = TD3(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)
        if args.rl_method == "trpo":
            policy_updater = TRPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)
        if args.rl_method == "ppo":
            policy_updater = PPO(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound, encode_dim=encode_dim)

    update_type = policy_updater.update_type  # "on_policy" or "off_policy"
    if args.max_step is None:
        if update_type == "on_policy":
            args.max_step = 5000000
            if args.psi_param_std is None: args.psi_param_std = 0 
        elif update_type == "off_policy":
            args.max_step = 1000000     
            if args.psi_param_std is None: args.psi_param_std = 1 
        if args.env_atari:
            args.max_step = args.max_step * 10 
        
    if method_type == "IL":
        if args.il_method == "irl": # maximum entropy IRL
            discriminator_updater = IRL(state_dim=state_dim, action_dim=action_dim, args=args)
        elif args.il_method == "gail":
            discriminator_updater = GAIL(state_dim=state_dim, action_dim=action_dim, args=args)
        elif args.il_method == "vail":
            discriminator_updater = VAIL(state_dim=state_dim, action_dim=action_dim, args=args)
        elif args.il_method == "airl":
            discriminator_updater = AIRL(state_dim=state_dim, action_dim=action_dim, args=args, policy_updater=policy_updater)  # need entropy coefficient and policy         
        elif args.il_method == "vild":  
            discriminator_updater = VILD(state_dim=state_dim, action_dim=action_dim, args=args, policy_updater=policy_updater)   # need entropy coefficient           
        elif args.il_method == "infogail":  
            discriminator_updater = InfoGAIL(state_dim=state_dim, action_dim=action_dim, args=args, policy_updater=policy_updater)   # AIRL version needs entropy coefficient and policy

        # pretrain pi for robosuite env. 
        if args.env_robosuite :
            discriminator_updater.behavior_cloning(policy_net=policy_updater.policy_net, learning_rate=args.learning_rate_pv, bc_step=args.bc_step) # pretrain pi 
        elif args.il_method == "vild":  # pretrain only q_psi
            discriminator_updater.behavior_cloning(policy_net=None, learning_rate=args.learning_rate_pv, bc_step=args.bc_step) 

    """ Set method and hyper parameter in file name"""
    if method_type == "RL":
        method_name = args.rl_method.upper()
        hypers = rl_hypers_parser(args)    
    else:
        method_name = args.il_method.upper() + "_" + args.rl_method.upper()
        hypers = rl_hypers_parser(args) + "_" + irl_hypers_parser(args)         
        
        if args.il_method == "vild" and args.vild_loss_type.lower() != "linear":
            method_name += "_" + args.vild_loss_type.upper()   
        
        if args.il_method == "infogail" and args.info_loss_type.lower() != "bce":
            method_name += "_" + args.info_loss_type.upper()

    if method_type == "RL":
        exp_name = "%s-%s_s%d" % (method_name, hypers, args.seed)
    elif method_type == "IL":
        exp_name = "%s-%s-%s_s%d" % (discriminator_updater.traj_name, method_name, hypers, args.seed)

    """ Set path for result and model files """
    result_path = "./results_%s/%s/%s/%s-%s" % (method_type, method_name, env_name, env_name, exp_name)
    model_path = "./results_%s/%s_models/%s/%s-%s" % (method_type, method_name, env_name, env_name, exp_name) 
    pathlib.Path("./results_%s/%s/%s" % (method_type, method_name, env_name)).mkdir(parents=True, exist_ok=True) 
    # if platform.system() != "Windows":
    pathlib.Path("./results_%s/%s_models/%s" % (method_type, method_name, env_name)).mkdir(parents=True, exist_ok=True) 
    print("Running %s" % (colored(method_name, p_color)))
    print("%s result will be saved at %s" % (colored(method_name, p_color), colored(result_path, p_color)))

    """ Function to update the parameters of value and policy networks"""
    def update_params_g(batch):
        states = torch.FloatTensor(np.stack(batch.state)).to(device)
        next_states = torch.FloatTensor(np.stack(batch.next_state)).to(device)
        masks = torch.FloatTensor(np.stack(batch.mask)).to(device).unsqueeze(-1)

        actions = torch.LongTensor(np.stack(batch.action)).to(device) if is_disc_action else torch.FloatTensor(np.stack(batch.action)).to(device) 

        if method_type == "RL":
            rewards = torch.FloatTensor(np.stack(batch.reward)).to(device).unsqueeze(-1)
            policy_updater.update_policy(states, actions.to(device), next_states, rewards, masks)
        elif method_type == "IL":
            nonlocal d_rewards 
            d_rewards = discriminator_updater.compute_reward(states, actions).detach().data
            
            # Append one-hot vector of context to state.
            if info_method:
                latent_codes = torch.LongTensor(np.stack(batch.latent_code)).to(device).view(-1,1)    # [batch_size, 1] 
                d_rewards += discriminator_updater.compute_posterior_reward(states, actions, latent_codes).detach().data

                latent_codes_onehot = torch.FloatTensor(states.size(0), encode_dim).to(device)
                latent_codes_onehot.zero_()
                latent_codes_onehot.scatter_(1, latent_codes, 1)  #should have size [batch_size, num_worker]

                states = torch.cat((states, latent_codes_onehot), 1) 
                next_states = torch.cat((next_states, latent_codes_onehot), 1)  

            policy_updater.update_policy(states, actions, next_states, d_rewards, masks)
    
    """ Storage and counters """
    memory = Memory(capacity=1000000)   # Memory buffer with 1 million max size.
    step, i_iter, tt_g, tt_d, perform_test = 0, 0, 0, 0, 0
    d_rewards = torch.FloatTensor(1).fill_(0)   ## placeholder
    log_interval = args.max_step // 1000     # 1000 lines in the text files
    if args.env_robosuite:
        log_interval = args.max_step // 500 # reduce to 500 lines to save experiment time
    save_model_interval = (log_interval * 10) # * (platform.system() != "Windows")  # do not save model ?
    print("Max steps: %s, Log interval: %s steps, Model interval: %s steps" % \
         (colored(args.max_step, p_color), colored(log_interval, p_color), colored(save_model_interval, p_color)))

    """ Reset seed again """  
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    """ Agent for testing in a separated environemnt """
    agent_test = Agent(env_test, render=args.render, t_max=args.t_max, test_cpu=test_cpu)
    if args.env_bullet: 
        log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)

    latent_code = None ## only for infogail 
    state = env.reset()
    done = 1 
    """ The actual learning loop"""
    for total_step in range(0, args.max_step + 1):

        """ Save the learned policy model """
        if save_model_interval > 0 and total_step % save_model_interval == 0: 
            policy_updater.save_model("%s_policy_T%d.pt" % (model_path, total_step))

        """ Test the policy before update """
        if total_step % log_interval == 0:
            perform_test = 1
         
        """ Test learned policy """
        if perform_test:
            if not info_method:
                if not args.env_bullet:
                    log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
                    perform_test = 0
                elif done: # Because env and env_test are the same object for pybullet. 
                    log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
                    perform_test = 0
            else:
                log_test = []
                for i_k in range(0, encode_dim):
                    # latent_code_test = discriminator_updater.sample_code().fill_(i_k)   # legacy code that changes the RNG sequence. Use this line to reproduce old results.
                    latent_code_test = torch.LongTensor(size=(1,1)).fill_(i_k)
                    latent_code_onehot_test = torch.FloatTensor(1, encode_dim)
                    latent_code_onehot_test.zero_()
                    latent_code_onehot_test.scatter_(1, latent_code_test, 1)
                    log_test += [agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10, latent_code_onehot=latent_code_onehot_test.squeeze() )] # use 1 instead of 10 to save time?
                perform_test = 0

        if info_method and latent_code is None:
            latent_code = discriminator_updater.sample_code()    #sample scalar latent code from the prior p(c) which is uniform. 
            latent_code_onehot = torch.FloatTensor(1, encode_dim)
            latent_code_onehot.zero_()
            latent_code_onehot.scatter_(1, latent_code, 1)
            latent_code_onehot = latent_code_onehot.squeeze()  #should have size [encode_dim]
            latent_code = latent_code.detach().numpy()

        state_var = torch.FloatTensor(state)
        if latent_code is not None:
            state_var = torch.cat((state_var, latent_code_onehot), 0)  

        """ take env step """
        if total_step <= args.random_action and update_type == "off_policy":    # collect random actions first for off policy methods
            action = env.action_space.sample()
        else:
            action = policy_updater.sample_action(state_var.to(device).unsqueeze(0)).to(device_cpu).detach().numpy()

        if args.il_method == "vild":    # Add noise from Sigma_k to action (noise_t = sqrt(Sigma_k)) 
            action_u = action + args.noise_t * np.random.normal( np.zeros(action.shape), np.ones(action.shape) )
            next_state, reward, done, _ = env.step(action_u)
        else:
            next_state, reward, done, _ = env.step(action)

        if step + 1 == args.t_max:
            done = 1
        memory.push(state, action, int(not done), next_state, reward, latent_code)
        state = next_state
        step = step + 1        

        """ reset env """
        if done :  # reset
            state = env.reset()
            step = 0
            latent_code = None 
                        
        """ Update policy """
        if update_type == "on_policy":
            if memory.size() >= args.big_batch_size and done :
                batch = memory.sample()

                if method_type == "IL":
                    for i_d in range(0, args.d_step):
                        index = discriminator_updater.index_sampler()   # should be inside update_discriminator for cleaner code...
                        t0_d = time.time()
                        discriminator_updater.update_discriminator(batch=batch, index=index, total_step=total_step) 
                        tt_d += time.time() - t0_d

                t0_g = time.time()
                update_params_g(batch=batch) 
                tt_g += time.time() - t0_g
                memory.reset() 

        elif update_type == "off_policy":
            if total_step >= args.big_batch_size:       

                if method_type == "IL":
                    index = discriminator_updater.index_sampler()
                    batch = memory.sample(args.mini_batch_size)    
                    t0_d = time.time()
                    discriminator_updater.update_discriminator(batch=batch, index=index, total_step=total_step) 
                    tt_d += time.time() - t0_d  
                elif method_type == "RL":
                    batch = memory.sample(args.mini_batch_size)    
                    
                t0_g = time.time()
                update_params_g(batch=batch)  
                tt_g += time.time() - t0_g
                       
        """ Print out result to stdout and save it to a text file for plotting """
        if total_step % log_interval == 0:
        
            result_text = t_format("Step %7d " % (total_step), 0) 
            if method_type == "RL":
                result_text += t_format("(g%2.2f)s" % (tt_g), 1)  
            elif method_type == "IL":
                c_reward_list = d_rewards.to(device_cpu).detach().numpy()
                result_text += t_format("(g%2.1f+d%2.1f)s" % (tt_g, tt_d), 1) 
                result_text += " | [D] " + t_format("min: %.2f" % np.amin(c_reward_list), 0.5) + t_format(" max: %.2f" % np.amax(c_reward_list), 0.5)

            result_text += " | [R_te] "
            if not info_method:
                result_text += t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                    + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            else:        
                result_text += "Avg " 
                for i_k in range(0, encode_dim):
                    result_text += t_format("%d: %.2f (%.2f)" % (i_k, log_test[i_k]['avg_reward'], log_test[i_k]['std_reward']), 2)
        
            if (args.rl_method == "sac"):
                result_text += ("| ent %0.3f" % (policy_updater.entropy_coef))

            if args.il_method == "vild":
                ## check estimated worker noise
                estimated_worker_noise = discriminator_updater.worker_net.get_worker_cov().to(device_cpu).detach().numpy().squeeze()
                if action_dim > 1:
                    estimated_worker_noise = estimated_worker_noise.mean(axis=0)  #average across action dim
                result_text += " | w_noise: %s" % (np.array2string(estimated_worker_noise, formatter={'float_kind':lambda x: "%.5f" % x}).replace('\n', '') )
                    
            tt_g = 0
            tt_d = 0

            print(result_text)
            with open(result_path + ".txt", 'a') as f:
                print(result_text, file=f) 
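The info-method branches above repeatedly turn an integer latent code into a one-hot vector with scatter_ and append it to the state. A small self-contained sketch of that pattern (tensor sizes are illustrative):

import torch

batch_size, encode_dim, state_dim = 4, 3, 8                   # illustrative sizes
states = torch.randn(batch_size, state_dim)
latent_codes = torch.randint(0, encode_dim, (batch_size, 1))  # one code index per row

latent_codes_onehot = torch.zeros(batch_size, encode_dim)
latent_codes_onehot.scatter_(1, latent_codes, 1)              # row i gets a 1 at column latent_codes[i]

augmented_states = torch.cat((states, latent_codes_onehot), dim=1)  # shape [batch_size, state_dim + encode_dim]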
Code example #3
File: learn_model.py  Project: Quanticnova/td-reg
def learn_model(args):

    print("RL result will be saved at %s" % args.rl_filename)
    print("RL model will be saved at %s" % args.rl_model_filename)
    if use_gpu:
        print("Using CUDA.")

    torch.manual_seed(args.rl_seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.rl_seed)
        torch.backends.cudnn.deterministic = True
    np.random.seed(args.rl_seed)
    random.seed(args.rl_seed)

    env = gym.make(args.env_name)
    env.seed(args.rl_seed)

    env_test = gym.make(args.env_name)
    env_test.seed(args.rl_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    a_bound = np.asscalar(env.action_space.high[0])
    a_low = np.asscalar(env.action_space.low[0])
    assert a_bound == -a_low

    ## Binary flag for manually clipping actions in the step function after adding Gaussian noise.
    clip = (args.env_name == "LunarLanderContinuous-v2"
            or args.env_name == "BipedalWalker-v2")

    print(env.observation_space)
    print(env.action_space)
    """define actor and critic"""
    policy_net = Policy(state_dim,
                        action_dim,
                        log_std=args.log_std,
                        a_bound=a_bound,
                        hidden_size=args.hidden_size,
                        activation=args.activation).to(device)
    value_net = Value(state_dim,
                      hidden_size=args.hidden_size,
                      activation=args.activation).to(device)

    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate_v)
    decayed_lambda_td = args.lambda_td

    def update_params_c(batch, i_iter):
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, args.gamma, args.tau)

        if args.lamret:
            returns = lambda_returns
        else:
            returns = mc_returns
        """perform critic update"""
        #gae_step(value_net, optimizer_value, states, lambda_returns, args.l2_reg)  # full batch GD
        gae_step_epoch(value_net, optimizer_value, states, returns,
                       args.l2_reg)  # Stochastic GD

    """ Function to update the parameters of value and policy networks"""

    def update_params_p(batch, i_iter):

        nonlocal decayed_lambda_td

        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        next_states = torch.from_numpy(np.stack(
            batch.next_state)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories, this is done after gae_step update"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, gamma=args.gamma, tau=args.tau)

        if args.method_name == "TRPO-RET-MC":
            returns = mc_returns.detach()  # detach() does not matter since we backprop through the policy network only.
        elif args.method_name == "TRPO-RET-GAE":
            returns = lambda_returns.detach()  # detach() does not matter here either.
        else:
            returns = 0  # returns is not used for TRPO and TRPO-TD.

        # Standardize advantages (subtract mean) or only rescale them?
        if args.mgae:
            advantages = (advantages - advantages.mean()) / advantages.std()  # mean-subtracted ("m-std") version
        else:
            advantages = advantages / advantages.std()  # scale-only ("std") version

        trpo_step_td(policy_net=policy_net, value_net=value_net, states=states, actions=actions, next_states=next_states, rewards=rewards, masks=masks, gamma=args.gamma, advantages=advantages, \
            max_kl=args.max_kl, damping=args.damping, \
            lambda_td=decayed_lambda_td, method_name=args.method_name, returns=returns, mtd=args.mtd)
        """ decay the td_reg parameter after update """
        decayed_lambda_td = decayed_lambda_td * args.decay_td

    """create agent"""
    agent = Agent(env, policy_net, render=False)
    agent_test = Agent(env_test,
                       policy_net,
                       mean_action=True,
                       render=args.render)
    """ The actual learning loop"""
    for i_iter in range(args.rl_max_iter_num):
        """ Save the learned policy model """
        if (i_iter % args.rl_save_model_interval == 0 and args.rl_save_model_interval > 0) \
            or (i_iter + 1 == args.rl_max_iter_num) or i_iter == 0:  # also save at the final iteration

            policy_net = policy_net.to(device_cpu)
            value_net = value_net.to(device_cpu)

            pickle.dump((policy_net, value_net),
                        open(args.rl_model_filename + ("_I%d.p" % (i_iter)),
                             'wb'))

            policy_net = policy_net.to(device)
            value_net = value_net.to(device)
        """ Test the policy before update """
        if i_iter % args.log_interval == 0 or i_iter + 1 == args.rl_max_iter_num:
            _, log_test = agent_test.collect_samples_test(max_num_episodes=20,
                                                          render=args.render,
                                                          clip=clip)
        """generate multiple trajectories that reach the minimum batch_size"""
        t0 = time.time()
        batch, log = agent.collect_samples_train(
            args.min_batch_size, render=False,
            clip=clip)  # this is on-policy samples
        t1 = time.time()
        """ update parameters """
        t0_d = time.time()
        update_params_c(batch, i_iter)  #critic update
        update_params_p(batch, i_iter)  #actor update
        t1_d = time.time()
        """ Print out result to stdout and save it to a text file for later usage"""
        if i_iter % args.log_interval == 0:

            result_text = t_format("Iter %6d (%2.2fs)+(%2.2fs)" %
                                   (i_iter, t1 - t0, t1_d - t0_d))
            result_text += " | [R] " + t_format(
                "Avg: %.2f (%.2f)" % (log['avg_reward'], log['std_reward']), 2)
            result_text += " | [R_test] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                            + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            print(result_text)

            with open(args.rl_filename, 'a') as f:
                print(result_text, file=f)
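estimate_advantages() is imported by this example but not shown on this page. A common GAE-style implementation that is consistent with how its three outputs (advantages, lambda_returns, mc_returns) are used above might look like the following sketch; it is an assumption about the helper, not the project's exact code.

import torch

def estimate_advantages_sketch(rewards, masks, values, gamma, tau):
    # rewards, masks, values: 1-D tensors over concatenated trajectories; masks[t] == 0 marks a terminal step.
    advantages = torch.zeros_like(rewards)
    mc_returns = torch.zeros_like(rewards)
    prev_value = prev_advantage = prev_return = 0.0
    for t in reversed(range(rewards.size(0))):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]   # TD error
        advantages[t] = delta + gamma * tau * masks[t] * prev_advantage  # GAE recursion
        mc_returns[t] = rewards[t] + gamma * masks[t] * prev_return      # discounted Monte Carlo return
        prev_value, prev_advantage, prev_return = values[t], advantages[t], mc_returns[t]
    lambda_returns = advantages + values  # TD(lambda) targets, used when args.lamret is set
    return advantages, lambda_returns, mc_returns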
Code example #4
File: bc_main.py  Project: voot-t/vild_code
def main(args):

    if args.il_method is None:
        raise NotImplementedError
    else:
        method_type = "IL"
        
    torch.manual_seed(args.seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.deterministic = True
        print(colored("Using CUDA.", p_color))
    np.random.seed(args.seed)
    random.seed(args.seed)
    test_cpu = True      # True to avoid moving gym's state to gpu tensor every step during testing.

    env_name = args.env_name 
    """ Create environment and get environment's info. """
    if args.env_atari:
        from my_utils.atari_wrappers import Task 
        env = Task(env_name, num_envs=1, clip_rewards=False, seed=args.seed)     
        env_test = env 
    elif args.env_bullet:
        import pybullet 
        import pybullet_envs 
        pybullet.connect(pybullet.DIRECT)
        env = gym.make(env_name)
        env.seed(args.seed)  
        env_test = env        
        if args.render:
            env_test.render(mode="human")
    elif args.env_robosuite:
        from my_utils.my_robosuite_utils import make_robosuite_env
        args.t_max = 500 
        env = make_robosuite_env(args)
        env_test = env
        # the robosuite sampler uses functions from Python's random module, so the seeds are already set.
        env_name = args.env_name + "_reach"

    else: 
        env = gym.make(env_name)     
        env.seed(args.seed)  
        env_test = env

    state_dim = env.observation_space.shape[0]
    is_disc_action = args.env_discrete
    action_dim = (0 if is_disc_action else env.action_space.shape[0])
    if args.env_robosuite:
        action_dim = action_dim - 1     # we disable gripper for reaching 
    if is_disc_action:
        a_bound = 1
        action_num = env.action_space.n 
        print("State dim: %d, action num: %d" % (state_dim, action_num))
    else:
        """ always normalize env. """ 
        if np.asscalar(env.action_space.high[0]) != 1:
            from my_utils.my_gym_utils import NormalizeGymWrapper
            env = NormalizeGymWrapper(env)
            env_test = NormalizeGymWrapper(env_test)
            print("Use state-normalized environments.")
        a_bound = np.asscalar(env.action_space.high[0])
        a_low = np.asscalar(env.action_space.low[0])
        assert a_bound == -a_low 
        assert a_bound == 1 
        print("State dim: %d, action dim: %d, action bound %d" % (state_dim, action_dim, a_bound))

        if "LunarLanderContinuous" in env_name or "BipedalWalker" in env_name:
            from my_utils.my_gym_utils import ClipGymWrapper
            env = ClipGymWrapper(env) 
            env_test = ClipGymWrapper(env_test) 

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    args.max_step = 1000000     
    if args.il_method == "bc":
        policy_updater = BC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
    elif args.il_method == "dbc":
        policy_updater = DBC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
    elif args.il_method == "cobc":
        policy_updater = COBC(state_dim=state_dim, action_dim=action_dim, args=args, a_bound=a_bound)
    discriminator_updater = policy_updater 

    update_type = policy_updater.update_type  # "off_policy"
    
    """ Set method and hyper parameter in file name"""
    method_name = args.il_method.upper()
    hypers = bc_hypers_parser(args)     
    exp_name = "%s-%s-%s_s%d" % (discriminator_updater.traj_name, method_name, hypers, args.seed)

    """ Set path for result and model files """
    result_path = "./results_%s/%s/%s/%s-%s" % (method_type, method_name, env_name, env_name, exp_name)
    model_path = "./results_%s/%s_models/%s/%s-%s" % (method_type, method_name, env_name, env_name, exp_name) 
    pathlib.Path("./results_%s/%s/%s" % (method_type, method_name, env_name)).mkdir(parents=True, exist_ok=True) 
    # if platform.system() != "Windows":
    pathlib.Path("./results_%s/%s_models/%s" % (method_type, method_name, env_name)).mkdir(parents=True, exist_ok=True) 
    print("Running %s" % (colored(method_name, p_color)))
    print("%s result will be saved at %s" % (colored(method_name, p_color), colored(result_path, p_color)))

    """ Storage and counters """
    step, i_iter, tt_g, tt_d, perform_test = 0, 0, 0, 0, 0
    log_interval = args.max_step // 1000     # 1000 lines in the text files
    if args.env_robosuite:
        log_interval = args.max_step // 500 # reduce to 500 lines to save experiment time
    save_model_interval = (log_interval * 10) # * (platform.system() != "Windows")  # do not save on my windows laptop
    print("Max steps: %s, Log interval: %s steps, Model interval: %s steps" % \
         (colored(args.max_step, p_color), colored(log_interval, p_color), colored(save_model_interval, p_color)))

    # """ Reset seed again """  
    # torch.manual_seed(args.seed)
    # np.random.seed(args.seed)
    # random.seed(args.seed)

    """ Agent for testing in a separated environemnt """
    agent_test = Agent(env_test, render=args.render, t_max=args.t_max, test_cpu=test_cpu)
    if args.env_bullet: 
        log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)

    latent_code = None ## only for infogail 
    # state = env.reset()
    """ The actual learning loop"""
    for total_step in range(0, args.max_step + 1):

        """ Save the learned policy model """
        if save_model_interval > 0 and total_step % save_model_interval == 0: 
            policy_updater.save_model("%s_policy_T%d.pt" % (model_path, total_step))

        """ Test the policy before update """
        if total_step % log_interval == 0:
            perform_test = 1
         
        """ Test learned policy """
        if perform_test:
            log_test = agent_test.collect_samples_test(policy=policy_updater, max_num_episodes=10)
            train_acc = policy_updater.evaluate_train_accuray()
            perform_test = 0
            
        """ Update policy """    
        t0_g = time.time()
        policy_updater.update_policy(total_step)  
        tt_g += time.time() - t0_g
                
        """ Print out result to stdout and save it to a text file for plotting """
        if total_step % log_interval == 0:
        
            result_text = t_format("Step %7d " % (total_step), 0) 
            result_text += t_format("(bc%2.1f)s" % (tt_g), 0) 

            result_text += " | [R_te] "
            result_text += t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
         
            result_text += " | [MSE_tr] " + t_format(" %.4f" % (train_acc), 0)
            
            if args.il_method == "dbc":
                ## check estimated worker noise
                estimated_worker_noise = policy_updater.worker_net.get_worker_cov().to(device_cpu).detach().numpy().squeeze()
                if action_dim > 1:
                    estimated_worker_noise = estimated_worker_noise.mean(axis=0)  #average across action dim
                result_text += " | w_noise: %s" % (np.array2string(estimated_worker_noise, formatter={'float_kind':lambda x: "%.3f" % x}).replace('\n', '') )

            tt_g = 0

            print(result_text)
            with open(result_path + ".txt", 'a') as f:
                print(result_text, file=f)
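update_policy() for the BC-style learners (BC, DBC, COBC) is not shown on this page. Since the log above reports a training MSE ([MSE_tr]), one step of such a supervised update presumably resembles the sketch below; policy_net, optimizer, and the demonstration tensors are hypothetical names standing in for the learner's internals.

import torch
import torch.nn.functional as F

def bc_update_sketch(policy_net, optimizer, demo_states, demo_actions, batch_size=256):
    # Sample a mini-batch of demonstration (state, action) pairs and regress the actions with an MSE loss.
    idx = torch.randint(0, demo_states.size(0), (batch_size,))
    pred_actions = policy_net(demo_states[idx])
    loss = F.mse_loss(pred_actions, demo_actions[idx])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()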