import os
from copy import deepcopy

import numpy as np
import torch


def mp_explore_in_env(args, pipe2_exp, worker_id):
    env = args.env
    reward_scale = args.reward_scale
    gamma = args.gamma
    random_seed = args.random_seed
    agent_rl = args.agent_rl
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    rollout_num = args.rollout_num
    del args

    torch.manual_seed(random_seed + worker_id)
    np.random.seed(random_seed + worker_id)

    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step

    '''build agent'''
    agent = agent_rl(net_dim, state_dim, action_dim)  # training agent
    agent.state = env.reset()
    # agent.device = torch.device('cpu')  # env_cpu--act_cpu is a little faster than env_cpu--act_gpu, but has high CPU utilization

    '''build replay buffer, init: total_step, reward_avg'''
    if_on_policy = bool(agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO', 'AgentInterPPO'})
    buffer = ReplayBuffer(max_memo // rollout_num + max_step, state_dim, if_on_policy=if_on_policy,
                          action_dim=1 if if_discrete else action_dim)  # build experience replay buffer

    exp_step = target_step // rollout_num
    with torch.no_grad():
        while True:
            # learner side: pipe1_exp.send(agent.act)
            agent.act = pipe2_exp.recv()

            agent.update_buffer(env, buffer, exp_step, reward_scale, gamma)

            buffer.update__now_len__before_sample()
            pipe2_exp.send((buffer.buf_state[:buffer.now_len],
                            buffer.buf_other[:buffer.now_len]))
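
# A minimal sketch of the learner-side launcher that pairs with mp_explore_in_env.
# This launcher is an illustrative assumption, not this repo's actual trainer: it
# shows only the pipe protocol (send the actor network, receive back the filled
# buffer slices). `agent` is any built AgentRL instance, and `extend_buffer` is a
# hypothetical merge method on the learner's ReplayBuffer.
import torch.multiprocessing as mp


def mp_train_sketch(args, agent, learner_buffer, num_rounds):
    pipes = [mp.Pipe() for _ in range(args.rollout_num)]
    process = [mp.Process(target=mp_explore_in_env, args=(args, pipe2, worker_id))
               for worker_id, (pipe1, pipe2) in enumerate(pipes)]
    [p.start() for p in process]

    for _ in range(num_rounds):
        for pipe1, _ in pipes:
            pipe1.send(agent.act)  # unblocks the worker's pipe2_exp.recv()
        for pipe1, _ in pipes:
            buf_state, buf_other = pipe1.recv()  # matches the worker's pipe2_exp.send(...)
            learner_buffer.extend_buffer(buf_state, buf_other)  # hypothetical merge API

    [p.terminate() for p in process]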
def train_and_evaluate(args):
    args.init_before_training()

    cwd = args.cwd
    env = args.env
    env_eval = args.env_eval
    agent_id = args.gpu_id
    agent_rl = args.agent_rl  # basic arguments

    gamma = args.gamma  # training arguments
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    reward_scale = args.reward_scale
    if_per = args.if_per

    show_gap = args.show_gap  # evaluate arguments
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    break_step = args.break_step
    if_break_early = args.if_break_early
    env_eval = deepcopy(env) if env_eval is None else deepcopy(env_eval)
    del args  # the hyper-parameters are unpacked above so they are easy to see

    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step

    '''init: Agent, Evaluator, ReplayBuffer'''
    agent = agent_rl(net_dim, state_dim, action_dim)  # build AgentRL
    agent.state = env.reset()

    evaluator = Evaluator(cwd=cwd, agent_id=agent_id, device=agent.device, env=env_eval,
                          eval_times1=eval_times1, eval_times2=eval_times2,
                          show_gap=show_gap)  # build Evaluator

    if_on_policy = agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO'}
    buffer = ReplayBuffer(max_memo + max_step, state_dim, if_on_policy=if_on_policy, if_per=if_per,
                          action_dim=1 if if_discrete else action_dim)  # build experience replay buffer

    if if_on_policy:
        steps = 0
    else:  # fill the replay buffer with random transitions before training
        with torch.no_grad():  # update replay buffer
            steps = _explore_before_train(env, buffer, target_step, reward_scale, gamma)
        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())
    total_step = steps

    if_solve = False
    while not ((if_break_early and if_solve)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        with torch.no_grad():  # speed up running
            steps = agent.update_buffer(env, buffer, target_step, reward_scale, gamma)
        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size, repeat_times)

        with torch.no_grad():  # speed up running
            if_solve = evaluator.evaluate_act__save_checkpoint(agent.act, steps, obj_a, obj_c)
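
# A minimal usage sketch for train_and_evaluate (an assumption for illustration):
# it sets only attributes the function reads above. `Arguments`, `PreprocessEnv`
# and `AgentSAC` stand for the hyper-parameter container, gym-env wrapper and an
# off-policy agent defined elsewhere in this repo; the names may differ in your version.
if __name__ == '__main__':
    import gym

    args = Arguments()
    args.env = PreprocessEnv(gym.make('LunarLanderContinuous-v2'))
    args.agent_rl = AgentSAC  # off-policy, so the buffer is pre-filled before the loop
    args.break_step = int(1e5)
    train_and_evaluate(args)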