Example #1
import os

import torch

# build_gym_env, Recorder, BufferTupleOnline, BufferArray and
# initial_exploration are assumed to be defined in the surrounding module.


def train_agent(rl_agent, net_dim, batch_size, repeat_times, gamma,
                reward_scale, cwd, env_name, max_memo, max_step,
                max_total_step, eval_times1, eval_times2, gpu_id, show_gap,
                if_stop, **_kwargs):  # 2020-06-01
    env, state_dim, action_dim, max_action, target_reward, is_discrete = build_gym_env(
        env_name, is_print=False)
    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1,
                        eval_size2=eval_times2)
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    is_online_policy = rl_agent.__name__ in {
        'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'}
    if is_online_policy:
        buffer = BufferTupleOnline(max_memo)
    else:
        buffer = BufferArray(max_memo, state_dim,
                             1 if is_discrete else action_dim)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 max_action, reward_scale,
                                                 gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)
    '''loop'''
    if_train = True
    while if_train:
        '''update the replay buffer by interacting with the environment'''
        with torch.no_grad():  # for saving the GPU buffer
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 max_action, reward_scale,
                                                 gamma)
        '''update network parameters by gradient descent on random samples from the buffer'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size,
                                                 repeat_times)
        # todo backtracking (disabled): reload the last saved model if the critic loss diverges
        # if loss_c > 4:
        #     agent.save_or_load_model(cwd, if_save=False)
        '''evaluate the actor and save it when it reaches a new maximum reward'''
        with torch.no_grad():  # for saving the GPU buffer
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act,
                                                       max_step, max_action,
                                                       agent.device,
                                                       is_discrete)
            if if_save:
                recorder.save_act(cwd, agent.act, gpu_id)
            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id,
                                                show_gap)
        '''break loop rules'''
        if_train = not ((if_stop and if_solve)
                        or recorder.total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop.mark'))
    recorder.save_npy__plot_png(cwd)
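
A minimal invocation sketch for this version follows. AgentPPO is one of the on-policy classes the function itself checks for and is assumed to be importable from the same package; every hyperparameter value is illustrative rather than tuned.

train_agent(rl_agent=AgentPPO,  # on-policy branch, so BufferTupleOnline is used
            net_dim=2 ** 8, batch_size=2 ** 8, repeat_times=2 ** 4,
            gamma=0.99, reward_scale=1.0,
            cwd='./AgentPPO_LunarLanderContinuous-v2',
            env_name='LunarLanderContinuous-v2',
            max_memo=2 ** 12, max_step=2 ** 10, max_total_step=int(1e6),
            eval_times1=2, eval_times2=4,
            gpu_id=0, show_gap=2 ** 6, if_stop=True)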
Example #2
import os

import torch

# build_gym_env, Recorder, BufferTupleOnline, BufferArray and
# initial_exploration are assumed to be defined in the surrounding module.


def train_agent(rl_agent, env_name, gpu_id, cwd, net_dim, max_memo, max_step,
                batch_size, repeat_times, reward_scale, gamma, break_step,
                if_break_early, show_gap, eval_times1, eval_times2,
                **_kwargs):  # 2020-09-18
    env, state_dim, action_dim, target_reward, if_discrete = build_gym_env(
        env_name, if_print=False)
    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    if_online_policy = rl_agent.__name__ in {
        'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'}
    if if_online_policy:
        buffer = BufferTupleOnline(max_memo)
    else:
        buffer = BufferArray(max_memo, state_dim,
                             1 if if_discrete else action_dim)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 if_discrete, reward_scale,
                                                 gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)
        '''pre-training and a hard target-network update before the training loop'''
        buffer.init_before_sample()
        agent.update_parameters(buffer, max_step, batch_size, repeat_times)
        agent.act_target.load_state_dict(agent.act.state_dict())
    '''loop'''
    if_train = True
    while if_train:
        '''update the replay buffer by interacting with the environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 reward_scale, gamma)
        '''update network parameters by gradient descent on random samples from the buffer'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size,
                                                 repeat_times)
        '''evaluate the actor and save it when it reaches a new maximum reward'''
        with torch.no_grad():  # for saving the GPU buffer
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act,
                                                       max_step, agent.device,
                                                       if_discrete)
            if if_save:
                recorder.save_act(cwd, agent.act, gpu_id)
            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id,
                                                show_gap)
        '''break loop rules'''
        if_train = not ((if_break_early and if_solve)
                        or recorder.total_step > break_step
                        or os.path.exists(f'{cwd}/stop'))
    recorder.save_npy__plot_png(cwd)
    # print the state-normalization statistics gathered in the buffer
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)
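
The trailing **_kwargs suggests this version is driven from a larger hyperparameter dictionary, so a sketch of such a call is shown below; all values are illustrative, and AgentGAE is taken from the on-policy set checked inside the function.

hyper_param = dict(
    rl_agent=AgentGAE,  # any class in the on-policy set selects BufferTupleOnline
    env_name='BipedalWalker-v3',
    gpu_id=0, cwd='./AgentGAE_BipedalWalker-v3',
    net_dim=2 ** 8, max_memo=2 ** 12, max_step=2 ** 10,
    batch_size=2 ** 8, repeat_times=2 ** 4,
    reward_scale=1.0, gamma=0.99,
    break_step=int(1e6), if_break_early=True,
    show_gap=2 ** 6, eval_times1=2, eval_times2=4,
    random_seed=1943,  # extra key, silently absorbed by **_kwargs
)
train_agent(**hyper_param)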
Example #3
import os
import time

import numpy as np
import torch

# build_env, get_total_return, initial_exploration, BufferTupleOnline and
# BufferArrayGPU are assumed to be defined in the surrounding module.


def mp__update_params(args, q_i_eva, q_o_eva):
    # 2020-11-11: update network parameters using the replay buffer;
    # communicates with an evaluator process through q_i_eva / q_o_eva.
    rl_agent = args.rl_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_step = args.max_step
    max_total_step = args.break_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    cwd = args.cwd
    env_name = args.env_name
    reward_scale = args.reward_scale
    if_stop = args.if_break_early
    gamma = args.gamma
    del args

    env, state_dim, action_dim, target_reward, if_discrete = build_env(
        env_name, if_print=False)
    '''build agent'''
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    '''send agent to q_i_eva'''
    from copy import deepcopy
    act_cpu = deepcopy(agent.act).to(torch.device("cpu"))
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False
    q_i_eva.put(act_cpu)  # q_i_eva 1. (first item: the initial actor copy)
    '''build replay buffer, init: total_step, reward_avg'''
    total_step = 0
    if rl_agent.__name__ in {'AgentPPO'}:
        buffer = BufferTupleOnline(max_memo)
        with torch.no_grad():
            reward_avg = get_total_return(env, act_cpu, max_step,
                                          torch.device("cpu"), if_discrete)
    elif rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArrayGPU(max_memo + max_step,
                                state_dim,
                                action_dim,
                                if_ppo=True)  # experiment replay buffer
        with torch.no_grad():
            reward_avg = get_total_return(env, act_cpu, max_step,
                                          torch.device("cpu"), if_discrete)
    else:
        buffer = BufferArrayGPU(max_memo,
                                state_dim,
                                action_dim=1 if if_discrete else action_dim,
                                if_ppo=False)
        '''initial exploration'''
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 if_discrete, reward_scale,
                                                 gamma, action_dim)
        reward_avg = np.average(rewards)
        step_sum = sum(steps)
        '''pre training and hard update before training loop'''
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())

        q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva n. (actor, reward, steps, loss_a, loss_c)
        total_step += step_sum
    '''training loop'''
    if_train = True
    if_solve = False
    while if_train:
        '''update the replay buffer by interacting with the environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 reward_scale, gamma)
        reward_avg = np.average(rewards) if len(rewards) else reward_avg
        step_sum = sum(steps)
        total_step += step_sum
        '''update network parameters by gradient descent on random samples from the buffer'''
        buffer.update_pointer_before_sample()
        loss_a_avg, loss_c_avg = agent.update_policy(buffer, max_step,
                                                     batch_size, repeat_times)
        '''send the updated actor to the evaluator, which saves the one with the maximum reward'''
        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg,
                     loss_c_avg))  # q_i_eva n.

        if q_o_eva.qsize() > 0:
            if_solve = q_o_eva.get()  # q_o_eva n.
        '''break loop rules'''
        if_train = not ((if_stop and if_solve) or total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop'))

    env, state_dim, action_dim, target_reward, if_discrete = build_env(
        env_name, if_print=False)  # rebuild the env to read its state-normalization statistics
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)

    q_i_eva.put('stop')  # tell the evaluator process to shut down
    while q_i_eva.qsize() > 0:  # wait until the evaluator has drained the queue
        time.sleep(1)
    time.sleep(4)
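
This worker is meant to run in its own process: actor snapshots and loss statistics go out on q_i_eva, a solved flag comes back on q_o_eva, and the string 'stop' tells the consumer to shut down. Below is a launch sketch under those assumptions; the SimpleNamespace stand-in for the project's Arguments object and the evaluator function mp__evaluate_agent are hypothetical, and the hyperparameter values are illustrative.

import multiprocessing as mp
from types import SimpleNamespace

if __name__ == '__main__':
    args = SimpleNamespace(  # hypothetical stand-in for the Arguments object
        rl_agent=AgentModPPO, env_name='LunarLanderContinuous-v2',
        cwd='./AgentModPPO', net_dim=2 ** 8, max_memo=2 ** 12,
        max_step=2 ** 10, break_step=int(1e6), batch_size=2 ** 9,
        repeat_times=2 ** 4, reward_scale=1.0, gamma=0.99,
        if_break_early=True)

    q_i_eva = mp.Queue(maxsize=16)  # trainer -> evaluator
    q_o_eva = mp.Queue(maxsize=16)  # evaluator -> trainer
    processes = [
        mp.Process(target=mp__update_params, args=(args, q_i_eva, q_o_eva)),
        mp.Process(target=mp__evaluate_agent,  # hypothetical evaluator worker
                   args=(args, q_i_eva, q_o_eva)),
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()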