def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    max_memo = args.max_memo
    env_name = args.env_name
    max_step = args.max_step
    batch_size = args.batch_size
    repeat_times = 2
    # reward_scale = args.reward_scale
    # gamma = args.gamma

    '''init'''
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer
    workers_num = len(qs_dist)  # one distribution queue per worker process

    '''loop'''
    is_training = True
    while is_training:
        # collect new transitions from every worker via the aggregation queue
        for i in range(workers_num):
            memo_array, is_solved = q_aggr.get()
            buffer.extend_memo(memo_array)
            if is_solved:
                is_training = False

        # sample mini-batches and distribute them back to the workers
        buffer.init_before_sample()
        for i in range(max_step * repeat_times):
            # batch_arrays = buffer.random_sample(batch_size, device=None)  # faster but worse
            for q_dist in qs_dist:
                batch_arrays = buffer.random_sample(batch_size, device=None)  # slower but better
                q_dist.put(batch_arrays)

    print('|| Exit: process__buffer')
def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    max_memo = args.max_memo
    env_name = args.env_name
    max_step = args.max_step
    batch_size = args.batch_size
    repeat_times = 2
    reward_scale = args.reward_scale
    gamma = args.gamma

    '''init'''
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env, be_quiet=False)
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer
    workers_num = len(qs_dist)  # one distribution queue per worker process

    '''loop'''
    with torch.no_grad():  # fill the replay buffer with exploration steps before training
        # rewards, steps = agent.update_buffer(
        #     env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(
            env, buffer, max_step, max_action, reward_scale, gamma, action_dim)

    while True:
        # collect new transitions from every worker via the aggregation queue
        for _ in range(workers_num):
            memo_array = q_aggr.get()
            buffer.extend_memo(memo_array)

        # sample mini-batches and distribute them back to the workers
        buffer.init_before_sample()
        for _ in range(max_step * repeat_times):
            for q_dist in qs_dist:
                batch_arrays = buffer.random_sample(batch_size, device=None)
                q_dist.put(batch_arrays)
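
# A minimal sketch of how process__buffer could be wired up with multiprocessing
# queues. The launcher name run__buffer_demo, the worker target process__worker,
# and the args object are assumptions for illustration only; only process__buffer
# above comes from this file.
def run__buffer_demo(args, workers_num=2):
    import multiprocessing as mp

    q_aggr = mp.Queue(maxsize=8)  # workers -> buffer: memo arrays of new transitions
    qs_dist = [mp.Queue(maxsize=8) for _ in range(workers_num)]  # buffer -> workers: sampled batches

    process = [mp.Process(target=process__buffer, args=(q_aggr, qs_dist, args))]
    # process += [mp.Process(target=process__worker, args=(q_aggr, q_dist, args))
    #             for q_dist in qs_dist]  # assumed worker target (explore + train), not shown here

    [p.start() for p in process]
    [p.join() for p in process]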