Example 1
def train_agent(rl_agent, env_name, gpu_id, cwd, net_dim, max_memo, max_step,
                batch_size, repeat_times, reward_scale, gamma, break_step,
                if_break_early, show_gap, eval_times1, eval_times2,
                **_kwargs):  # 2020-09-18
    env, state_dim, action_dim, target_reward, if_discrete = build_gym_env(
        env_name, if_print=False)
    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1,
                        eval_size2=eval_times2)  # eval_times* map to Recorder's eval_size* arguments
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    if_online_policy = rl_agent.__name__ in {
        'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'}
    if if_online_policy:
        buffer = BufferTupleOnline(max_memo)
    else:
        buffer = BufferArray(max_memo, state_dim,
                             1 if if_discrete else action_dim)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 if_discrete, reward_scale,
                                                 gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

        '''pre-training and hard update before the training loop'''
        buffer.init_before_sample()
        agent.update_parameters(buffer, max_step, batch_size, repeat_times)
        agent.act_target.load_state_dict(agent.act.state_dict())
    '''loop'''
    if_train = True
    while if_train:
        '''update replay buffer by interact with environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 reward_scale, gamma)
        '''update network parameters by random sampling buffer for gradient descent'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size,
                                                 repeat_times)
        '''saves the agent with max reward'''
        with torch.no_grad():  # no gradient tracking needed for evaluation; saves GPU memory
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act,
                                                       max_step, agent.device,
                                                       if_discrete)
            if if_save:
                recorder.save_act(cwd, agent.act, gpu_id)
            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id,
                                                show_gap)
        '''break loop rules'''
        if_train = not ((if_break_early and if_solve)
                        or recorder.total_step > break_step
                        or os.path.exists(f'{cwd}/stop.mark'))
    recorder.save_npy__plot_png(cwd)
    buffer.print_state_norm(env.neg_state_avg,
                            env.div_state_std)  # print state normalization parameters
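
A minimal usage sketch for the function above, not taken from the source: the agent class name, the environment id, and every hyperparameter value below are illustrative assumptions; any compatible agent class from the surrounding library could be passed as rl_agent.

if __name__ == '__main__':
    # Hypothetical invocation of train_agent; AgentSAC is assumed to be an
    # off-policy agent class defined elsewhere, and all values are placeholders.
    train_agent(
        rl_agent=AgentSAC, env_name='LunarLanderContinuous-v2',
        gpu_id=0, cwd='./demo_log',
        net_dim=2 ** 8, max_memo=2 ** 17, max_step=2 ** 10,
        batch_size=2 ** 8, repeat_times=1,
        reward_scale=1.0, gamma=0.99,
        break_step=2 ** 20, if_break_early=True,
        show_gap=2 ** 8, eval_times1=2, eval_times2=4,
    )
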
Example 2
def train_agent_discrete(
        class_agent,
        env_name,
        cwd,
        net_dim,
        max_step,
        max_memo,
        max_epoch,  # env
        batch_size,
        gamma,
        update_gap,
        reward_scale,
        **_kwargs):  # 2020-05-20
    env = gym.make(env_name)
    '''init'''
    state_dim, action_dim, action_max, target_reward = get_env_info(
        env, is_print=True)
    assert isinstance(action_max, int)  # means Discrete action space

    agent = class_agent(env, state_dim, action_dim, net_dim)  # training agent
    buffer = BufferArray(max_memo, state_dim,
                         action_dim=1)  # experience replay buffer
    recorder = Recorder(agent, max_step, action_max, target_reward, env_name,
                        **_kwargs)
    '''loop'''
    with torch.no_grad():  # update replay buffer
        rewards, steps = initial_exploration(env, buffer, max_step, action_max,
                                             reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, 0, 0)
    try:
        for epoch in range(max_epoch):
            '''update replay buffer by interact with environment'''
            with torch.no_grad():  # no gradient tracking needed while interacting with the environment
                rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                     action_max, reward_scale,
                                                     gamma)
            '''update network parameters by random sampling buffer for stochastic gradient descent'''
            loss_a, loss_c = agent.update_parameters(buffer, max_step,
                                                     batch_size, update_gap)
            '''show/check the reward, save the max reward actor'''
            with torch.no_grad():  # no gradient tracking needed for evaluation; saves GPU memory
                '''NOTICE! Recorder saves the agent with max reward automatically. '''
                recorder.show_reward(rewards, steps, loss_a, loss_c)

                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")
    # except AssertionError:  # for BipedWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
    #     return False

    train_time = recorder.print_and_save_npy(env_name, cwd)

    # agent.save_or_load_model(cwd, is_save=True)  # save max reward agent in Recorder
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
    return True
Example 3
def train_agent__off_policy(class_agent, net_dim, batch_size, repeat_times,
                            gamma, reward_scale, cwd, env_name, max_step,
                            max_memo, max_epoch, **_kwargs):  # 2020-06-01
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
        env, is_print=False)
    '''init'''
    agent = class_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    buffer = BufferArray(max_memo, state_dim,
                         action_dim=1 if is_discrete else action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        **_kwargs)  # unnecessary
    '''loop'''
    with torch.no_grad():  # update replay buffer
        # rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                             reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)
    try:
        for epoch in range(max_epoch):
            # update replay buffer by interact with environment
            with torch.no_grad():  # no gradient tracking needed while interacting with the environment
                rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                     max_action, reward_scale,
                                                     gamma)

            # update network parameters by random sampling buffer for gradient descent
            buffer.init_before_sample()
            loss_a, loss_c = agent.update_parameters(buffer, max_step,
                                                     batch_size, repeat_times)

            # show/check the reward, save the max reward actor
            with torch.no_grad():  # no gradient tracking needed for evaluation; saves GPU memory
                # NOTICE! Recorder saves the agent with max reward automatically.
                recorder.show_reward(rewards, steps, loss_a, loss_c)

                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("| raise KeyboardInterrupt and break training loop")
    # except AssertionError:  # for BipedWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
Example 4
def train_offline_policy(rl_agent, net_dim, batch_size, repeat_times, gamma,
                         reward_scale, cwd, env_name, max_step, max_memo,
                         max_total_step, **_kwargs):  # 2020-06-01
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
        env, is_print=False)
    assert not is_discrete
    '''init: agent, buffer, recorder'''
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    buffer = BufferArray(max_memo, state_dim,
                         action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        **_kwargs)  # unnecessary
    '''loop'''
    with torch.no_grad():  # update replay buffer
        # rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                             reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)

    while True:
        # update replay buffer by interact with environment
        with torch.no_grad():  # no gradient tracking needed while interacting with the environment
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 max_action, reward_scale,
                                                 gamma)

        # update network parameters by random sampling buffer for gradient descent
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size,
                                                 repeat_times)

        # show/check the reward, save the max reward actor
        with torch.no_grad():  # no gradient tracking needed for evaluation; saves GPU memory
            # NOTICE! Recorder saves the agent with max reward automatically.
            recorder.show_reward(rewards, steps, loss_a, loss_c)

            is_solved = recorder.check_reward(cwd, loss_a, loss_c)
        if is_solved:
            print('Reached target_reward:', target_reward, recorder.reward_max)
            break
        if recorder.total_step > max_total_step:
            print('Reached max_total_step:', max_total_step, recorder.total_step)
            break

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    draw_plot_with_npy(cwd, train_time)
Example 5
def train_agent(
        rl_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
        env_name, max_step, max_memo, max_total_step,
        eva_size, gpu_id, show_gap, **_kwargs):  # 2020-06-01
    env, state_dim, action_dim, max_action, target_reward, is_discrete = build_gym_env(env_name, is_print=False)

    '''init: agent, buffer, recorder'''
    recorder = Recorder()
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    is_online_policy = rl_agent.__name__ in {'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'}
    if is_online_policy:
        buffer = BufferTupleOnline(max_memo)
    else:
        buffer = BufferArray(max_memo, state_dim, 1 if is_discrete else action_dim)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

    '''loop'''
    is_training = True
    while is_training:
        '''update replay buffer by interact with environment'''
        with torch.no_grad():  # no gradient tracking needed while interacting with the environment
            rewards, steps = agent.update_buffer(
                env, buffer, max_step, max_action, reward_scale, gamma)

        '''update network parameters by random sampling buffer for gradient descent'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(
            buffer, max_step, batch_size, repeat_times)

        '''saves the agent with max reward'''
        with torch.no_grad():  # no gradient tracking needed for evaluation; saves GPU memory
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            is_saved = recorder.update__record_evaluate(
                env, agent.act, max_step, max_action, eva_size, agent.device, is_discrete)
            if is_saved:
                recorder.save_act(cwd, agent.act, gpu_id)

            is_solved = recorder.check_is_solved(target_reward, gpu_id, show_gap)
        '''break loop rules'''
        if is_solved or recorder.total_step > max_total_step or os.path.exists(f'{cwd}/stop.mark'):
            is_training = False

    recorder.save_npy__plot_png(cwd)
Example 6
def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    max_memo = args.max_memo
    env_name = args.env_name
    max_step = args.max_step
    batch_size = args.batch_size
    repeat_times = 2

    reward_scale = args.reward_scale
    gamma = args.gamma
    '''init'''
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(
        env, is_print=False)
    buffer = BufferArray(max_memo, state_dim,
                         action_dim)  # experience replay buffer

    workers_num = len(qs_dist)
    '''loop'''
    with torch.no_grad():  # update replay buffer
        # rewards, steps = agent.update_buffer(
        #     env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                             reward_scale, gamma, action_dim)

    is_training = True
    while is_training:
        for _ in range(workers_num):
            memo_array, is_solved = q_aggr.get()
            buffer.extend_memo(memo_array)
            if is_solved:
                is_training = False

        buffer.init_before_sample()
        for _ in range(max_step * repeat_times):
            # alternative: sample one batch here and share it across all worker queues
            # batch_arrays = buffer.random_sample(batch_size, device=None)
            for q_dist in qs_dist:
                batch_arrays = buffer.random_sample(batch_size,
                                                    device=None)  # slower: draws an independent batch per worker
                q_dist.put(batch_arrays)

    print('|| Exit: process__buffer')
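
A minimal wiring sketch for process__buffer, not from the source: the driver function name is hypothetical, and args stands for the same configuration object the function unpacks above. The function expects one aggregation queue shared by all workers and one distribution queue per worker.

import multiprocessing as mp

def run_buffer_process(args, workers_num=4):  # hypothetical driver, not part of the library
    q_aggr = mp.Queue(maxsize=8)  # workers -> buffer: (memo_array, is_solved) tuples
    qs_dist = [mp.Queue(maxsize=8) for _ in range(workers_num)]  # buffer -> each worker: sampled batches
    proc = mp.Process(target=process__buffer, args=(q_aggr, qs_dist, args))
    proc.start()
    return proc, q_aggr, qs_dist
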
Example 7
def mp__update_params(args, q_i_eva, q_o_eva):  # 2020-11-11 update network parameters using replay buffer
    rl_agent = args.rl_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_step = args.max_step
    max_total_step = args.break_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    cwd = args.cwd
    env_name = args.env_name
    reward_scale = args.reward_scale
    if_stop = args.if_break_early
    gamma = args.gamma
    del args

    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)

    '''build agent'''
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    '''send agent to q_i_eva'''
    from copy import deepcopy
    act_cpu = deepcopy(agent.act).to(torch.device("cpu"))
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False
    q_i_eva.put(act_cpu)  # q_i_eva 1.

    '''build replay buffer, init: total_step, reward_avg'''
    total_step = 0
    if rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArrayGPU(max_memo + max_step, state_dim, action_dim, if_ppo=True)  # experience replay buffer
        with torch.no_grad():
            reward_avg = get_episode_reward(env, act_cpu, max_step, torch.device("cpu"), if_discrete)
    else:
        buffer = BufferArrayGPU(max_memo, state_dim, action_dim=1 if if_discrete else action_dim, if_ppo=False)
        '''initial exploration'''
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step, if_discrete, reward_scale, gamma, action_dim)
        reward_avg = np.average(rewards)
        step_sum = sum(steps)

        '''pre training and hard update before training loop'''
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if hasattr(agent, 'act_target'):
            agent.act_target.load_state_dict(agent.act.state_dict())

        q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva n.
        total_step += step_sum

    '''training loop'''
    if_train = True
    if_solve = False
    while if_train:
        '''update replay buffer by interact with environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)
        reward_avg = np.average(rewards) if len(rewards) else reward_avg
        step_sum = sum(steps)
        total_step += step_sum

        '''update network parameters by random sampling buffer for gradient descent'''
        buffer.update_pointer_before_sample()
        loss_a_avg, loss_c_avg = agent.update_policy(buffer, max_step, batch_size, repeat_times)

        '''saves the agent with max reward'''
        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg))  # q_i_eva n.

        if q_o_eva.qsize() > 0:
            if_solve = q_o_eva.get()  # q_o_eva n.

        '''break loop rules'''
        if_train = not ((if_stop and if_solve)
                        or total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop'))

    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)

    q_i_eva.put('stop')
    while q_i_eva.qsize() > 0:
        time.sleep(1)
    time.sleep(4)
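
A minimal sketch of how mp__update_params might be launched, not from the source: the evaluator process function is not part of this excerpt, so the sketch takes it as a parameter; the driver name and queue setup are assumptions. The evaluator is expected to consume actor snapshots from q_i_eva and report the if_solve signal through q_o_eva.

import multiprocessing as mp

def run_mp_training(args, evaluator_target):  # hypothetical driver, not part of the library
    q_i_eva = mp.Queue()  # trainer -> evaluator: actor snapshots, rewards, losses
    q_o_eva = mp.Queue()  # evaluator -> trainer: if_solve signal
    processes = [
        mp.Process(target=mp__update_params, args=(args, q_i_eva, q_o_eva)),
        mp.Process(target=evaluator_target, args=(args, q_i_eva, q_o_eva)),  # evaluator not shown in this excerpt
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
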
Example 8
def train_agent(  # 2020-11-11
        rl_agent, env_name, gpu_id, cwd, net_dim, max_memo, max_step,
        batch_size, repeat_times, reward_scale, gamma, break_step,
        if_break_early, show_gap, eval_times1, eval_times2,
        **_kwargs):  # 2020-09-18
    env, state_dim, action_dim, target_reward, if_discrete = build_env(
        env_name, if_print=False)
    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    if rl_agent.__name__ in {'AgentPPO'}:
        buffer = BufferTupleOnline(max_memo)
    elif rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArray(max_memo + max_step,
                             state_dim,
                             action_dim,
                             if_ppo=True)
    else:
        buffer = BufferArray(max_memo,
                             state_dim,
                             action_dim=1 if if_discrete else action_dim,
                             if_ppo=False)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 if_discrete, reward_scale,
                                                 gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)
        '''pre training and hard update before training loop'''
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if hasattr(agent, 'act_target'):
            agent.act_target.load_state_dict(agent.act.state_dict())
    '''loop'''
    if_train = True
    while if_train:
        '''update replay buffer by interact with environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 reward_scale, gamma)
        '''update network parameters by random sampling buffer for gradient descent'''
        loss_a, loss_c = agent.update_policy(buffer, max_step, batch_size,
                                             repeat_times)
        '''saves the agent with max reward'''
        recorder.update__record_explore(steps, rewards, loss_a, loss_c)

        if_save = recorder.update__record_evaluate(env, agent.act, max_step,
                                                   agent.device, if_discrete)
        if if_save:
            recorder.save_act(cwd, agent.act, gpu_id)

        with torch.no_grad():  # no gradient tracking needed for evaluation; saves GPU memory
            if_solve = recorder.check__if_solved(target_reward, gpu_id,
                                                 show_gap, cwd)
        '''break loop rules'''
        if_train = not ((if_break_early and if_solve)
                        or recorder.total_step > break_step
                        or os.path.exists(f'{cwd}/stop'))

    recorder.save_npy__draw_plot(cwd)
    print(f'SavedDir: {cwd}\n'
          f'UsedTime: {time.time() - recorder.start_time:.0f}')

    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)