Example 1
import gym  # needed for gym.make() below

# BufferArray and get_env_info are helpers from the surrounding project module
# and are assumed to be importable in this namespace.


def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    max_memo = args.max_memo
    env_name = args.env_name
    max_step = args.max_step
    batch_size = args.batch_size
    repeat_times = 2  # each collection round distributes max_step * repeat_times batches to every worker

    # reward_scale = args.reward_scale
    # gamma = args.gamma

    '''init'''
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer

    workers_num = len(qs_dist)

    '''loop'''
    is_training = True
    while is_training:
        for i in range(workers_num):
            memo_array, is_solved = q_aggr.get()
            buffer.extend_memo(memo_array)
            if is_solved:
                is_training = False

        buffer.init_before_sample()
        for i in range(max_step * repeat_times):
            # batch_arrays = buffer.random_sample(batch_size, device=None)  # sampling once here and sharing the same batch with every worker is faster but trains worse
            for q_dist in qs_dist:
                batch_arrays = buffer.random_sample(batch_size, device=None)  # a fresh batch per worker is slower but trains better
                q_dist.put(batch_arrays)

    print('|| Exit: process__buffer')
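
For context, here is a minimal worker-side stub (an assumption, not part of the original source; process__worker, memo_dim, and the zero-filled rollout are illustrative names and placeholders). It only mirrors the queue protocol that Example 1's process__buffer expects: each worker pushes one (memo_array, is_solved) pair per round into the shared q_aggr, then reads max_step * repeat_times batches back from its own q_dist.

import numpy as np


def process__worker(q_aggr, q_dist, args, memo_dim, rounds=4, **_kwargs):
    """Illustrative worker stub: it only exercises the queue protocol above."""
    max_step = args.max_step
    repeat_times = 2  # must match the value hard-coded in process__buffer

    for round_i in range(rounds):
        # Placeholder rollout: a real worker would fill each row with one transition
        # in whatever per-row layout BufferArray.extend_memo expects (memo_dim columns).
        memo_array = np.zeros((max_step, memo_dim), dtype=np.float32)
        is_solved = (round_i == rounds - 1)  # report "solved" on the last round so the buffer process can exit
        q_aggr.put((memo_array, is_solved))

        # Consume the batches that process__buffer distributes to this worker;
        # a real worker would run its network updates here.
        for _ in range(max_step * repeat_times):
            _batch_arrays = q_dist.get()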
Example 2
import gym    # needed for gym.make() below
import torch  # needed for torch.no_grad() below

# BufferArray, get_env_info, and initial_exploration are helpers from the
# surrounding project module and are assumed to be importable in this namespace.


def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    max_memo = args.max_memo
    env_name = args.env_name
    max_step = args.max_step
    batch_size = args.batch_size
    repeat_times = 2  # each collection round distributes max_step * repeat_times batches to every worker

    reward_scale = args.reward_scale
    gamma = args.gamma
    '''init'''
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env, be_quiet=False)
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer

    workers_num = len(qs_dist)
    '''loop'''
    with torch.no_grad():  # update replay buffer
        # rewards, steps = agent.update_buffer(
        #     env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                             reward_scale, gamma, action_dim)

    while True:  # unlike Example 1, this variant has no is_solved termination check
        for _ in range(workers_num):
            memo_array = q_aggr.get()
            buffer.extend_memo(memo_array)

        buffer.init_before_sample()
        for _ in range(max_step * repeat_times):
            for q_dist in qs_dist:
                batch_arrays = buffer.random_sample(batch_size, device=None)
                q_dist.put(batch_arrays)
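
Finally, a hedged launching sketch (also an assumption, not from the original source): one shared q_aggr carries rollouts from the workers to the buffer process, and one q_dist per worker carries sampled batches back. It wires the Example 1 variant of process__buffer (whose q_aggr items are (memo_array, is_solved) pairs) to the process__worker stub sketched above.

import multiprocessing as mp


def run__buffer_demo(args, workers_num=2, memo_dim=8):
    # memo_dim is a placeholder; it must match the row width BufferArray expects.
    q_aggr = mp.Queue(maxsize=workers_num)                       # rollouts: workers -> buffer
    qs_dist = [mp.Queue(maxsize=8) for _ in range(workers_num)]  # batches: buffer -> each worker

    processes = [mp.Process(target=process__buffer, args=(q_aggr, qs_dist, args))]
    processes += [mp.Process(target=process__worker, args=(q_aggr, q_dist, args, memo_dim))
                  for q_dist in qs_dist]

    for p in processes:
        p.start()
    for p in processes:
        p.join()

On platforms that spawn new interpreters instead of forking (Windows, recent macOS), run__buffer_demo should be called under an if __name__ == '__main__': guard, and args must be picklable.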