Beispiel #1
0
def train_agent_discrete(
        class_agent,
        env_name,
        cwd,
        net_dim,
        max_step,
        max_memo,
        max_epoch,  # env
        batch_size,
        gamma,
        update_gap,
        reward_scale,
        **_kwargs):  # 2020-05-20
    """Train an agent on a gym environment that has a discrete action space.

    The replay buffer is first filled by ``initial_exploration``. Afterwards,
    for at most ``max_epoch`` epochs, the agent collects transitions
    (``agent.update_buffer``), updates its networks
    (``agent.update_parameters``) and the recorder reports and checks the
    reward. Training ends early when ``recorder.check_reward`` returns a
    truthy value or when a ``KeyboardInterrupt`` is raised.

    Args:
        class_agent: callable invoked as
            ``class_agent(env, state_dim, action_dim, net_dim)``.
        env_name: id handed to ``gym.make``.
        cwd: directory handed to the recorder and the plotting helper.
        net_dim: forwarded to ``class_agent``.
        max_step: forwarded to the recorder, exploration and update calls.
        max_memo: first argument of ``BufferArray`` (presumably its capacity).
        max_epoch: upper bound on the number of training epochs.
        batch_size: forwarded to ``agent.update_parameters``.
        gamma: forwarded to exploration and ``agent.update_buffer``.
        update_gap: forwarded to ``agent.update_parameters``.
        reward_scale: forwarded to exploration and ``agent.update_buffer``.
        **_kwargs: extra keyword arguments forwarded to ``Recorder``.

    Returns:
        True once the results have been saved and plotted.
    """
    env = gym.make(env_name)

    # --- init ---
    env_info = get_env_info(env, is_print=True)
    state_dim, action_dim, action_max, target_reward = env_info
    assert isinstance(action_max, int)  # means Discrete action space

    # The buffer stores a single value per action (action_dim=1) because the
    # action space is discrete.
    agent = class_agent(env, state_dim, action_dim, net_dim)
    buffer = BufferArray(max_memo, state_dim, action_dim=1)
    recorder = Recorder(agent, max_step, action_max, target_reward, env_name,
                        **_kwargs)

    # --- loop ---
    # Pre-fill the replay buffer before the first parameter update.
    with torch.no_grad():
        rewards, steps = initial_exploration(
            env, buffer, max_step, action_max, reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, 0, 0)

    try:
        for _ in range(max_epoch):
            # Interact with the environment; no gradients are needed here.
            with torch.no_grad():
                rewards, steps = agent.update_buffer(
                    env, buffer, max_step, action_max, reward_scale, gamma)

            # Update the networks from samples of the replay buffer.
            actor_loss, critic_loss = agent.update_parameters(
                buffer, max_step, batch_size, update_gap)

            # Report progress; the Recorder saves the agent with max reward
            # automatically.
            with torch.no_grad():
                recorder.show_reward(rewards, steps, actor_loss, critic_loss)
                solved = recorder.check_reward(cwd, actor_loss, critic_loss)

            if solved:
                break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")

    elapsed = recorder.print_and_save_npy(env_name, cwd)
    draw_plot_with_npy(cwd, elapsed)
    return True
Beispiel #2
0
def train_agent_sac(
        agent_class,
        env_name,
        cwd,
        net_dim,
        max_step,
        max_memo,
        max_epoch,  # env
        batch_size,
        gamma,
        reward_scale,
        **_kwargs):  # 2020-0430
    """Train an agent with the memory-array based loop (named for SAC).

    The memory is pre-filled by ``uniform_exploration``. Then, for at most
    ``max_epoch`` epochs, the agent collects transitions
    (``agent.update_memory``), updates its parameters
    (``agent.update_parameter``) and the recorder reports and checks the
    reward. Training ends early when ``recorder.check_reward`` returns a
    truthy value or when a ``KeyboardInterrupt`` is raised.

    Args:
        agent_class: callable invoked as
            ``agent_class(env, state_dim, action_dim, net_dim)``.
        env_name: id handed to ``gym.make``.
        cwd: directory handed to the recorder and the plotting helper.
        net_dim: forwarded to ``agent_class``.
        max_step: forwarded to the recorder, exploration and update calls.
        max_memo: first argument of ``MemoryArray`` (presumably its capacity).
        max_epoch: upper bound on the number of training epochs.
        batch_size: forwarded to ``agent.update_parameter``.
        gamma: forwarded to exploration and ``agent.update_memory``.
        reward_scale: forwarded to exploration and ``agent.update_memory``.
        **_kwargs: accepted and ignored.

    Returns:
        True once the results have been saved and plotted.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env)
    state_dim, action_dim, max_action, target_reward = env_info

    agent = agent_class(env, state_dim, action_dim, net_dim)
    memo = MemoryArray(max_memo, state_dim, action_dim)
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        show_gap=2 ** 6)

    # Pre-fill the memory before the first parameter update.
    uniform_exploration(
        env, max_step, max_action, gamma, reward_scale, memo, action_dim)

    try:
        for _ in range(max_epoch):
            # Interaction only; gradients are disabled to save GPU memory.
            with torch.no_grad():
                rewards, steps = agent.update_memory(
                    env, memo, max_step, max_action, reward_scale, gamma)

            actor_loss, critic_loss = agent.update_parameter(
                memo, max_step, batch_size)

            with torch.no_grad():
                recorder.show_reward(rewards, steps, actor_loss, critic_loss)
                solved = recorder.check_reward(cwd, actor_loss, critic_loss)

            if solved:
                break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")

    elapsed = recorder.show_and_save(env_name, cwd)
    draw_plot_with_npy(cwd, elapsed)
    return True
Beispiel #3
0
def train_agent__off_policy(class_agent, net_dim, batch_size, repeat_times,
                            gamma, reward_scale, cwd, env_name, max_step,
                            max_memo, max_epoch, **_kwargs):  # 2020-06-01
    """Train an off-policy agent on a gym environment.

    The replay buffer is first filled by ``initial_exploration``. Then, for
    at most ``max_epoch`` epochs, the agent collects transitions
    (``agent.update_buffer``), updates its networks
    (``agent.update_parameters``) and the recorder reports and checks the
    reward. Training ends early when ``recorder.check_reward`` returns a
    truthy value or when a ``KeyboardInterrupt`` is raised. The model is saved
    through ``agent.save_or_load_model`` only if the task was solved.

    Args:
        class_agent: callable invoked as
            ``class_agent(state_dim, action_dim, net_dim)``.
        net_dim: forwarded to ``class_agent``.
        batch_size: forwarded to ``agent.update_parameters``.
        repeat_times: forwarded to ``agent.update_parameters``.
        gamma: forwarded to exploration and ``agent.update_buffer``.
        reward_scale: forwarded to exploration and ``agent.update_buffer``.
        cwd: directory handed to the recorder, the model saver and the
            plotting helper.
        env_name: id handed to ``gym.make``.
        max_step: forwarded to the recorder, exploration and update calls.
        max_memo: first argument of ``BufferArray`` (presumably its capacity).
        max_epoch: upper bound on the number of training epochs.
        **_kwargs: extra keyword arguments forwarded to ``Recorder``.

    Returns:
        None.
    """
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
        env, is_print=False)

    # --- init ---
    agent = class_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    # A discrete action is stored as a single value in the replay buffer.
    buffer = BufferArray(max_memo,
                         state_dim,
                         action_dim=1 if is_discrete else action_dim)
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        **_kwargs)

    # `is_solved` is read after the try block. Bind it up front so that an
    # empty epoch range or an early KeyboardInterrupt cannot leave it
    # undefined (which previously raised UnboundLocalError).
    is_solved = False

    # --- loop ---
    with torch.no_grad():  # pre-fill the replay buffer
        rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                             reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)
    try:
        for epoch in range(max_epoch):
            # update replay buffer by interacting with the environment
            with torch.no_grad():  # for saving the GPU buffer
                rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                     max_action, reward_scale,
                                                     gamma)

            # update network parameters by random sampling buffer for gradient descent
            buffer.init_before_sample()
            loss_a, loss_c = agent.update_parameters(buffer, max_step,
                                                     batch_size, repeat_times)

            # show/check the reward, save the max reward actor
            with torch.no_grad():  # for saving the GPU buffer
                # NOTICE! Recorder saves the agent with max reward automatically.
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("| raise KeyboardInterrupt and break training loop")

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
Beispiel #4
0
def train_agent(agent_class, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
                batch_size, update_gap, gamma, exp_noise, pol_noise, reward_scale,  # update
                **_kwargs):  # 2020-0430
    """Train an agent with a ``Memories`` replay store and reward normalization.

    A previously saved model and memory are loaded from ``cwd`` first
    (``is_save=False``). Each epoch the agent interacts with the environment
    (``agent.inactive_in_env``), the memory indices are refreshed and the
    parameters are updated (``agent.update_parameter``). Training stops when
    ``recorder.check_reward`` returns a truthy value, on ``KeyboardInterrupt``,
    or is aborted on a NaN loss or an ``AssertionError``.

    Args:
        agent_class: callable invoked as
            ``agent_class(state_dim, action_dim, net_dim)``.
        env_name: id handed to ``gym.make``.
        cwd: directory used for loading/saving and plotting.
        net_dim: forwarded to ``agent_class``.
        max_step: forwarded to the recorder and ``agent.inactive_in_env``.
        max_memo: first argument of ``Memories`` (presumably its capacity).
        max_epoch: upper bound on the number of training epochs.
        batch_size, update_gap, gamma, pol_noise: forwarded to
            ``agent.update_parameter``.
        exp_noise: forwarded to ``agent.inactive_in_env``.
        reward_scale: passed to ``RewardNormalization`` as ``size``.
        **_kwargs: accepted and ignored.

    Returns:
        True after results and memory were saved; False if a loss was NaN or
        an ``AssertionError`` was raised during training.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env)
    state_dim, action_dim, max_action, target_reward = env_info

    agent = agent_class(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    # A falsy max_action marks a discrete action space, stored as one value.
    if max_action:
        memo_action_dim = action_dim
    else:
        memo_action_dim = 1
    memo_dim = 1 + 1 + state_dim + memo_action_dim + state_dim
    memo = Memories(max_memo, memo_dim=memo_dim)
    memo.save_or_load_memo(cwd, is_save=False)

    recorder = Recorder(agent, max_step, max_action, target_reward, env_name)
    r_norm = RewardNormalization(
        n_max=target_reward, n_min=recorder.reward_avg, size=reward_scale)

    try:
        for epoch in range(max_epoch):
            # Interaction only; gradients are disabled to save GPU memory.
            with torch.no_grad():
                rewards, steps = agent.inactive_in_env(
                    env, memo, max_step, exp_noise, max_action, r_norm)
                memo.refresh_indices()

            actor_loss, critic_loss = agent.update_parameter(
                memo, sum(steps), batch_size, pol_noise, update_gap, gamma)

            # A NaN loss means training has diverged; give up on this run.
            if np.isnan(actor_loss) or np.isnan(critic_loss):
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            with torch.no_grad():
                recorder.show_reward(epoch, rewards, steps, actor_loss, critic_loss)
                solved = recorder.check_reward(cwd, actor_loss, critic_loss)

            if solved:
                break

    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")
    except AssertionError:  # for BipedWalker BUG 2020-03-03
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    elapsed = recorder.show_and_save(env_name, cwd)

    # The max-reward agent is saved by the Recorder; only the memory is saved here.
    memo.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, elapsed)
    return True
Beispiel #5
0
def train_agent_ppo(agent_class, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
                    batch_size, gamma,
                    **_kwargs):  # 2020-0430
    """Train a PPO agent with automatic state normalization.

    A previously saved model is loaded from ``cwd`` first (``is_save=False``).
    Each epoch the agent gathers a fresh memory
    (``agent.inactive_in_env_ppo``) and updates its parameters
    (``agent.update_parameter_ppo``) with ``ep_ratio`` going linearly from 1
    towards 0 over the epochs. Training stops when ``recorder.check_reward``
    returns a truthy value, on ``KeyboardInterrupt``, or is aborted on a NaN
    loss or an ``AssertionError``.

    Args:
        agent_class: callable invoked as
            ``agent_class(state_dim, action_dim, net_dim)``.
        env_name: id handed to ``gym.make``.
        cwd: directory used for loading, saving and plotting.
        net_dim: forwarded to ``agent_class``.
        max_step, max_memo: forwarded to ``agent.inactive_in_env_ppo``
            (``max_step`` also goes to the recorder).
        max_epoch: upper bound on the number of training epochs.
        batch_size, gamma: forwarded to ``agent.update_parameter_ppo``.
        **_kwargs: accepted and ignored.

    Returns:
        True after results were saved and plotted; False if a loss was NaN or
        an ``AssertionError`` was raised during training.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env)
    state_dim, action_dim, max_action, target_reward = env_info

    agent = agent_class(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    state_norm = AutoNormalization((state_dim,), clip=6.0)
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        state_norm=state_norm)

    try:
        for epoch in range(max_epoch):
            # Interaction only; gradients are disabled to save GPU memory.
            with torch.no_grad():
                rewards, steps, memory = agent.inactive_in_env_ppo(
                    env, max_step, max_memo, max_action, state_norm)

            remaining_ratio = 1 - epoch / max_epoch
            l_total, l_value = agent.update_parameter_ppo(
                memory, batch_size, gamma, ep_ratio=remaining_ratio)

            # A NaN loss means training has diverged; give up on this run.
            if np.isnan(l_total) or np.isnan(l_value):
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            with torch.no_grad():
                recorder.show_reward(epoch, rewards, steps, l_value, l_total)
                solved = recorder.check_reward(cwd, l_value, l_total)

            if solved:
                print(';;;', solved)
                break

    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")
    except AssertionError:  # for BipedWalker BUG 2020-03-03
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    elapsed = recorder.show_and_save(env_name, cwd)
    draw_plot_with_npy(cwd, elapsed)
    return True
Beispiel #6
0
def train_agent_ppo(class_agent, net_dim, batch_size, repeat_times, gamma,
                    reward_scale, cwd, env_name, max_step, max_memo, max_epoch,
                    **_kwargs):  # 2020-0430
    """Train a PPO agent; each epoch collects a fresh memory and updates on it.

    A previously saved model is loaded from ``cwd`` first (``is_save=False``).
    Training stops when ``recorder.check_reward`` returns a truthy value, on
    ``KeyboardInterrupt``, or is aborted on an ``AssertionError``.

    Args:
        class_agent: callable invoked as
            ``class_agent(state_dim, action_dim, net_dim)``.
        net_dim: forwarded to ``class_agent``.
        batch_size, repeat_times: forwarded to ``agent.update_parameter_ppo``.
        gamma, reward_scale: forwarded to ``agent.inactive_in_env_ppo``.
        cwd: directory used for loading, saving and plotting.
        env_name: id handed to ``gym.make``.
        max_step, max_memo: forwarded to ``agent.inactive_in_env_ppo``
            (``max_step`` also goes to the recorder).
        max_epoch: upper bound on the number of training epochs.
        **_kwargs: accepted and ignored.

    Returns:
        True after results were saved and plotted; False if an
        ``AssertionError`` was raised during training.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env, is_print=False)
    state_dim, action_dim, max_action, target_reward, is_discrete = env_info

    agent = class_agent(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    recorder = Recorder(agent, max_step, max_action, target_reward, env_name)

    try:
        for _ in range(max_epoch):
            # Interaction only; gradients are disabled to save GPU memory.
            with torch.no_grad():
                rewards, steps, memory = agent.inactive_in_env_ppo(
                    env, max_step, max_memo, max_action, reward_scale, gamma)

            actor_loss, critic_loss = agent.update_parameter_ppo(
                memory, batch_size, repeat_times)

            with torch.no_grad():
                recorder.show_reward(rewards, steps, actor_loss, critic_loss)
                solved = recorder.check_reward(cwd, actor_loss, critic_loss)

            if solved:
                break

    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")
    except AssertionError:  # for BipedWalker BUG 2020-03-03
        print(
            "AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again."
        )
        return False

    elapsed = recorder.print_and_save_npy(env_name, cwd)
    draw_plot_with_npy(cwd, elapsed)
    return True
Beispiel #7
0
def process__workers(gpu_id, root_cwd, q_aggr, q_dist, args, **_kwargs):
    """Run one training worker that exchanges replay data through two queues.

    The worker reads its hyper-parameters from ``args``, creates its own
    working directory ``<root_cwd>/<args.cwd>_<gpu_id>``, seeds NumPy and
    torch with ``42 + gpu_id`` and then runs the usual
    update_buffer / update_parameters / record loop. Its replay buffer is a
    local ``BufferArray`` subclass that pushes its memories to ``q_aggr`` and
    takes training batches from ``q_dist``.

    Args:
        gpu_id: index used for ``CUDA_VISIBLE_DEVICES``, the random seed and
            the working-directory suffix.
        root_cwd: parent directory of the worker's working directory.
        q_aggr: queue-like object; receives ``(memories, is_solved)`` tuples.
        q_dist: queue-like object; provides the batches used for training.
        args: object with the attributes ``class_agent``, ``env_name``,
            ``cwd``, ``net_dim``, ``max_step``, ``max_epoch``, ``batch_size``,
            ``gamma``, ``update_gap`` and ``reward_scale``.
        **_kwargs: extra keyword arguments forwarded to ``Recorder``.

    Returns:
        True once the results have been saved and plotted.
    """
    class_agent = args.class_agent
    env_name = args.env_name
    cwd = args.cwd
    net_dim = args.net_dim
    max_step = args.max_step
    # max_memo = args.max_memo
    max_epoch = args.max_epoch
    # NOTE(review): the factor 1.5 turns batch_size into a float; confirm that
    # agent.update_parameters accepts a non-integer batch size.
    batch_size = args.batch_size * 1.5
    gamma = args.gamma
    update_gap = args.update_gap
    reward_scale = args.reward_scale

    # Per-worker output directory: <root_cwd>/<args.cwd>_<gpu_id>
    cwd = '{}/{}_{}'.format(root_cwd, cwd, gpu_id)
    os.makedirs(cwd, exist_ok=True)
    # Presumably limits this process to the GPU with index gpu_id; this only
    # takes effect if CUDA has not been initialized yet -- TODO confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    # A different seed per gpu_id so that workers do not share random streams.
    random_seed = 42 + gpu_id
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.set_default_dtype(torch.float32)
    torch.set_num_threads(8)

    env = gym.make(env_name)
    # Read by BufferArrayMP.init_before_sample through the closure, so the
    # value sent to q_aggr is whatever this variable holds at call time.
    is_solved = False

    class BufferArrayMP(BufferArray):
        """BufferArray variant that ships data through q_aggr and q_dist."""

        def init_before_sample(self):
            """Put this buffer's memories and the solved flag on q_aggr."""
            q_aggr.put((self.memories, is_solved))
            # self.now_len = self.max_len if self.is_full else self.next_idx

        def random_sample(self, _batch_size, device=None):
            """Return the next batch taken from q_dist as a list of tensors.

            ``_batch_size`` is ignored; the batch content is whatever q_dist
            provides.
            """
            batch_arrays = q_dist.get()
            '''convert array into torch.tensor'''
            tensors = [
                torch.tensor(ary, device=device) for ary in batch_arrays
            ]
            return tensors

    '''init'''
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
        env, is_print=True)
    agent = class_agent(env, state_dim, action_dim, net_dim)  # training agent
    # NOTE(review): the buffer is sized with max_step, not args.max_memo (which
    # is commented out above) -- presumably because the worker only holds the
    # most recent transitions; confirm against BufferArray.
    buffer = BufferArrayMP(max_step, state_dim,
                           action_dim)  # experiment replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        **_kwargs)
    '''loop'''
    # with torch.no_grad():  # update replay buffer
    #     # rewards, steps = agent.update_buffer(
    #     #     env, buffer, max_step, max_action, reward_scale, gamma)
    #     rewards, steps = initial_exploration(
    #         env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
    # recorder.show_reward(rewards, steps, 0, 0)
    try:
        for epoch in range(max_epoch):
            '''update replay buffer by interact with environment'''
            with torch.no_grad():  # for saving the GPU buffer
                rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                     max_action, reward_scale,
                                                     gamma)
            '''update network parameters by random sampling buffer for stochastic gradient descent'''
            # Nothing in this function calls buffer.init_before_sample();
            # presumably agent.update_parameters does -- TODO confirm.
            loss_a, loss_c = agent.update_parameters(buffer, max_step,
                                                     batch_size, update_gap)
            '''show/check the reward, save the max reward actor'''
            with torch.no_grad():  # for saving the GPU buffer
                '''NOTICE! Recorder saves the agent with max reward automatically. '''
                recorder.show_reward(rewards, steps, loss_a, loss_c)

                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        # A manual interrupt ends training but still saves and plots below.
        print("raise KeyboardInterrupt while training.")
    # except AssertionError:  # for BipedWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
    #     return False

    train_time = recorder.print_and_save_npy(env_name, cwd)

    # agent.save_or_load_model(cwd, is_save=True)  # save max reward agent in Recorder
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
    return True
Beispiel #8
0
def train_agent_ppo(class_agent, batch_size, repeat_times, gamma, reward_scale,
                    cwd, env_name, max_step, net_dim, max_memo, max_epoch,
                    **_kwargs):  # 2020-0430
    """Train an on-policy PPO agent with automatic state normalization.

    Each epoch the buffer storage is reset, the agent collects fresh
    transitions (``agent.update_buffer_ppo``), updates its networks
    (``agent.update_parameters_ppo``) and the recorder reports and checks the
    reward. Training ends early when ``recorder.check_reward`` returns a
    truthy value or when a ``KeyboardInterrupt`` is raised. The model is saved
    through ``agent.save_or_load_model`` only if the task was solved.

    Default hyper-parameters noted by the original author for PPO::

        max_memo = 2 ** 11
        repeat_times = 2 ** 3
        batch_size = 2 ** 8
        net_dim = 2 ** 7
        gamma = 0.99
        env_name = "LunarLanderContinuous-v2" or "BipedalWalker-v3"

    Args:
        class_agent: callable invoked as
            ``class_agent(state_dim, action_dim, net_dim)``.
        batch_size, repeat_times: forwarded to ``agent.update_parameters_ppo``.
        gamma, reward_scale: forwarded to ``agent.update_buffer_ppo``.
        cwd: directory handed to the recorder, the model saver and the
            plotting helper.
        env_name: id handed to ``gym.make``.
        max_step, max_memo: forwarded to ``agent.update_buffer_ppo``
            (``max_step`` also goes to the recorder).
        net_dim: forwarded to ``class_agent``.
        max_epoch: upper bound on the number of training epochs.
        **_kwargs: extra keyword arguments forwarded to ``Recorder``.

    Returns:
        None.
    """
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
        env, is_print=False)

    # --- init ---
    agent = class_agent(state_dim, action_dim, net_dim)

    # on policy algorithm. Generalization Advantage Estimate. ICLR. 2016.
    buffer = BufferListPPO()
    # on policy algorithm can do normalization for state
    state_norm = AutoNormalization((state_dim, ), clip=6.0)

    recorder = Recorder(agent,
                        max_step,
                        max_action,
                        target_reward,
                        env_name,
                        state_norm=state_norm,
                        **_kwargs)

    # `is_solved` is read after the try block. Bind it up front so that an
    # empty epoch range or an early KeyboardInterrupt cannot leave it
    # undefined (which previously raised UnboundLocalError).
    is_solved = False

    try:
        for epoch in range(max_epoch):
            # on policy algorithm refresh replay buffer for each parameters update
            buffer.storage = []

            # update replay buffer by interacting with the environment
            with torch.no_grad():  # for saving the GPU buffer
                rewards, steps = agent.update_buffer_ppo(
                    env, buffer, max_step, max_memo, max_action, reward_scale,
                    gamma, state_norm)

            # update network parameters by random sampling buffer for gradient descent
            loss_a, loss_c = agent.update_parameters_ppo(
                buffer, batch_size, repeat_times)

            # show/check the reward, save the max reward actor
            with torch.no_grad():  # for saving the GPU buffer
                # NOTICE! Recorder saves the agent with max reward automatically.
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break

    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    draw_plot_with_npy(cwd, train_time)