Example 1
def train_agent(rl_agent, net_dim, batch_size, repeat_times, gamma,
                reward_scale, cwd, env_name, max_memo, max_step,
                max_total_step, eval_times1, eval_times2, gpu_id, show_gap,
                if_stop, **_kwargs):  # 2020-06-01
    env, state_dim, action_dim, max_action, target_reward, is_discrete = build_gym_env(
        env_name, is_print=False)
    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1,
                        eval_size2=eval_times2)  # todo eva_size1
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    is_online_policy = bool(
        rl_agent.__name__ in
        {'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'})
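    # on-policy agents (the PPO/GAE variants above) get a tuple-based online
    # buffer, while off-policy agents get a preallocated array buffer that is
    # pre-filled by random initial exploration before the training loop starts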
    if is_online_policy:
        buffer = BufferTupleOnline(max_memo)
    else:
        buffer = BufferArray(max_memo, state_dim,
                             1 if is_discrete else action_dim)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 max_action, reward_scale,
                                                 gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)
    '''loop'''
    if_train = True
    while if_train:
        '''update replay buffer by interacting with the environment'''
        with torch.no_grad():  # no gradient tracking needed while exploring
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 max_action, reward_scale,
                                                 gamma)
        '''update network parameters by randomly sampling the buffer for gradient descent'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size,
                                                 repeat_times)
        # if loss_c > 4:  # todo backtracking
        #     agent.save_or_load_model(cwd, if_save=False)
        '''save the agent with the max reward'''
        with torch.no_grad():  # no gradient tracking needed for evaluation
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act,
                                                       max_step, max_action,
                                                       agent.device,
                                                       is_discrete)
            if if_save:
                recorder.save_act(cwd, agent.act, gpu_id)
            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id,
                                                show_gap)
        '''break loop rules'''
        if_train = not ((if_stop and if_solve)
                        or recorder.total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop.mark'))
    recorder.save_npy__plot_png(cwd)
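
A minimal sketch of how this train_agent could be invoked. AgentPPO is one of
the on-policy classes named above; the environment name and every
hyperparameter value are placeholders chosen for illustration only.

# hypothetical call of Example 1's train_agent (all values are placeholders)
train_agent(rl_agent=AgentPPO,
            net_dim=2 ** 8, batch_size=2 ** 7, repeat_times=2,
            gamma=0.99, reward_scale=1.0,
            cwd='./train_logs', env_name='LunarLanderContinuous-v2',
            max_memo=2 ** 12, max_step=2 ** 10, max_total_step=2 ** 20,
            eval_times1=2, eval_times2=4,
            gpu_id=0, show_gap=2 ** 6,
            if_stop=True)  # stop as soon as target_reward is reached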
Example 2
def train_agent(rl_agent, env_name, gpu_id, cwd, net_dim, max_memo, max_step,
                batch_size, repeat_times, reward_scale, gamma, break_step,
                if_break_early, show_gap, eval_times1, eval_times2,
                **_kwargs):  # 2020-09-18
    env, state_dim, action_dim, target_reward, if_discrete = build_gym_env(
        env_name, if_print=False)
    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    if_online_policy = bool(
        rl_agent.__name__ in
        {'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'})
    if if_online_policy:
        buffer = BufferTupleOnline(max_memo)
    else:
        buffer = BufferArray(max_memo, state_dim,
                             1 if if_discrete else action_dim)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 if_discrete, reward_scale,
                                                 gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)
        '''pre-training and hard update before the training loop'''
        buffer.init_before_sample()
        agent.update_parameters(buffer, max_step, batch_size, repeat_times)
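        # hard update: copy the pre-trained actor's weights into the target
        # actor so both networks enter the training loop with identical weights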
        agent.act_target.load_state_dict(agent.act.state_dict())
    '''loop'''
    if_train = True
    while if_train:
        '''update replay buffer by interacting with the environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 reward_scale, gamma)
        '''update network parameters by randomly sampling the buffer for gradient descent'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size,
                                                 repeat_times)
        '''save the agent with the max reward'''
        with torch.no_grad():  # no gradient tracking needed for evaluation
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act,
                                                       max_step, agent.device,
                                                       if_discrete)
            if if_save:
                recorder.save_act(cwd, agent.act, gpu_id)
            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id,
                                                show_gap)
        '''break loop rules'''
        if_train = not ((if_break_early and if_solve)
                        or recorder.total_step > break_step
                        or os.path.exists(f'{cwd}/stop'))
    recorder.save_npy__plot_png(cwd)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)
Example 3
def mp__update_params(args, q_i_eva, q_o_eva):
    # 2020-11-11: update network parameters using the replay buffer
    rl_agent = args.rl_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_step = args.max_step
    max_total_step = args.break_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    cwd = args.cwd
    env_name = args.env_name
    reward_scale = args.reward_scale
    if_stop = args.if_break_early
    gamma = args.gamma
    del args

    env, state_dim, action_dim, target_reward, if_discrete = build_env(
        env_name, if_print=False)
    '''build agent'''
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    '''send agent to q_i_eva'''
    from copy import deepcopy
    act_cpu = deepcopy(agent.act).to(torch.device("cpu"))
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False
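    # hand the frozen CPU copy of the actor to the evaluator process; later
    # puts send (act_cpu, reward_avg, step_sum, loss_a, loss_c) tuples and,
    # at the very end, the string 'stop'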
    q_i_eva.put(act_cpu)  # q_i_eva 1.
    '''build replay buffer, init: total_step, reward_avg'''
    total_step = 0
    if rl_agent.__name__ in {'AgentPPO'}:
        buffer = BufferTupleOnline(max_memo)
        with torch.no_grad():
            reward_avg = get_total_return(env, act_cpu, max_step,
                                          torch.device("cpu"), if_discrete)
    elif rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArrayGPU(max_memo + max_step,
                                state_dim,
                                action_dim,
                                if_ppo=True)  # experience replay buffer
        with torch.no_grad():
            reward_avg = get_total_return(env, act_cpu, max_step,
                                          torch.device("cpu"), if_discrete)
    else:
        buffer = BufferArrayGPU(max_memo,
                                state_dim,
                                action_dim=1 if if_discrete else action_dim,
                                if_ppo=False)
        '''initial exploration'''
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 if_discrete, reward_scale,
                                                 gamma, action_dim)
        reward_avg = np.average(rewards)
        step_sum = sum(steps)
        '''pre-training and hard update before the training loop'''
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())

        q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva n.
        total_step += step_sum
    '''training loop'''
    if_train = True
    if_solve = False
    while if_train:
        '''update replay buffer by interacting with the environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 reward_scale, gamma)
        reward_avg = np.average(rewards) if len(rewards) else reward_avg
        step_sum = sum(steps)
        total_step += step_sum
        '''update network parameters by randomly sampling the buffer for gradient descent'''
        buffer.update_pointer_before_sample()
        loss_a_avg, loss_c_avg = agent.update_policy(buffer, max_step,
                                                     batch_size, repeat_times)
        '''save the agent with the max reward'''
        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg,
                     loss_c_avg))  # q_i_eva n.

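        # non-blocking poll of the evaluator's verdict (if_solve) so the
        # training loop never stalls waiting for an evaluation to finish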
        if q_o_eva.qsize() > 0:
            if_solve = q_o_eva.get()  # q_o_eva n.
        '''break loop rules'''
        if_train = not ((if_stop and if_solve) or total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop'))

    env, state_dim, action_dim, target_reward, if_discrete = build_env(
        env_name, if_print=False)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)

    q_i_eva.put('stop')
    while q_i_eva.qsize() > 0:
        time.sleep(1)
    time.sleep(4)
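
mp__update_params is one half of a trainer/evaluator pair: it pushes the CPU
actor and running statistics onto q_i_eva and polls q_o_eva for an if_solve
flag. Below is a minimal sketch of how the two processes could be wired
together; mp__evaluate_agent and train_agent_mp are hypothetical helpers
written only to mirror the queue protocol visible above, and the evaluation
logic itself is left as a placeholder.

import multiprocessing as mp

def mp__evaluate_agent(args, q_i_eva, q_o_eva):  # hypothetical evaluator process
    act_cpu = q_i_eva.get()  # q_i_eva 1. the first item is the bare CPU actor
    while True:
        item = q_i_eva.get()  # q_i_eva n.
        if item == 'stop':  # the trainer signals that training has finished
            break
        act_cpu, reward_avg, step_sum, loss_a, loss_c = item
        print(f'steps {step_sum:8}  reward_avg {reward_avg:8.2f}  '
              f'loss_a {loss_a:8.2f}  loss_c {loss_c:8.2f}')
        # ... run evaluation episodes with act_cpu and decide if_solve ...
        if_solve = False  # placeholder decision
        q_o_eva.put(if_solve)  # q_o_eva n. report back to the trainer


def train_agent_mp(args):  # hypothetical launcher for the two processes
    q_i_eva = mp.Queue(maxsize=16)  # trainer -> evaluator
    q_o_eva = mp.Queue(maxsize=16)  # evaluator -> trainer
    workers = [mp.Process(target=mp__update_params, args=(args, q_i_eva, q_o_eva)),
               mp.Process(target=mp__evaluate_agent, args=(args, q_i_eva, q_o_eva))]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()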