import os
import time

import numpy as np
import torch

# The versions of mp__update_params below also rely on project-internal helpers assumed to be
# importable alongside them: BufferArrayGPU (GPU replay buffer), build_gym_env / build_env
# (environment builders), and the rollout helpers get_episode_reward / initial_exploration.


def mp__update_params(args, q_i_buf, q_o_buf, q_i_eva, q_o_eva):  # update network parameters using replay buffer
    class_agent = args.rl_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_step = args.max_step
    max_total_step = args.max_total_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    del args

    state_dim, action_dim = q_o_buf.get()  # q_o_buf 1.
    agent = class_agent(state_dim, action_dim, net_dim)

    from copy import deepcopy
    act_cpu = deepcopy(agent.act).to(torch.device("cpu"))
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False
    q_i_buf.put(act_cpu)  # q_i_buf 1.
    # q_i_buf.put(act_cpu)  # q_i_buf 2.  # warning
    q_i_eva.put(act_cpu)  # q_i_eva 1.

    buffer = BufferArrayGPU(max_memo, state_dim, action_dim)  # experience replay buffer

    '''initial_exploration'''
    buffer_array, reward_list, step_list = q_o_buf.get()  # q_o_buf 2.
    reward_avg = np.average(reward_list)
    step_sum = sum(step_list)
    buffer.extend_memo(buffer_array)
    q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva 1.

    total_step = step_sum
    is_training = True
    while is_training:
        buffer_array, reward_list, step_list = q_o_buf.get()  # q_o_buf n.
        reward_avg = np.average(reward_list)
        step_sum = sum(step_list)
        total_step += step_sum
        buffer.extend_memo(buffer_array)

        buffer.init_before_sample()
        loss_a_avg, loss_c_avg = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_buf.put(act_cpu)  # q_i_buf n.
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg))  # q_i_eva n.

        if q_o_eva.qsize() > 0:
            is_solved = q_o_eva.get()  # q_o_eva n.
            if is_solved:
                is_training = False
        if total_step > max_total_step:
            is_training = False

    q_i_buf.put('stop')
    q_i_eva.put('stop')
    while q_i_buf.qsize() > 0 or q_i_eva.qsize() > 0:
        time.sleep(1)
    time.sleep(4)
    print('; quit: params')
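# --- Hedged sketch (not part of the source): the exploration worker that the version above
# --- expects on the other ends of q_i_buf / q_o_buf. The message order (env shapes first,
# --- then (buffer_array, reward_list, step_list) batches, with updated actors or a 'stop'
# --- sentinel coming back) is read off the get()/put() calls in mp__update_params.
# --- explore_env() is a hypothetical placeholder for the project's rollout code; its return
# --- value must match what buffer.extend_memo() accepts.
def mp__explore_in_env__sketch(args, q_i_buf, q_o_buf):
    env, state_dim, action_dim, *_ = build_gym_env(args.env_name, if_print=False)
    max_step = args.max_step
    del args

    q_o_buf.put((state_dim, action_dim))  # q_o_buf 1. tell the updater the tensor shapes
    act_cpu = q_i_buf.get()               # q_i_buf 1. first actor copy from the updater

    # The updater consumes one batch for initial exploration and another at the top of its
    # loop before sending a new actor, so the worker produces two batches with the first actor.
    q_o_buf.put(explore_env(env, act_cpu, max_step))  # q_o_buf 2. initial exploration batch

    while True:
        q_o_buf.put(explore_env(env, act_cpu, max_step))  # q_o_buf n.

        q_i = q_i_buf.get()  # q_i_buf n. updated actor, or the 'stop' sentinel
        if isinstance(q_i, str) and q_i == 'stop':
            break
        act_cpu = q_i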
def mp__update_params(args, q_i_buf, q_o_buf, q_i_eva, q_o_eva):  # update network parameters using replay buffer
    class_agent = args.class_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_epoch = args.max_epoch
    max_step = args.max_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    args.init_for_training()
    del args

    state_dim, action_dim = q_o_buf.get()  # q_o_buf 1.
    agent = class_agent(state_dim, action_dim, net_dim)

    from copy import deepcopy
    act_cpu = deepcopy(agent.act).to(torch.device("cpu"))
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False
    q_i_buf.put(act_cpu)  # q_i_buf 1.
    # q_i_buf.put(act_cpu)  # q_i_buf 2.  # warning
    q_i_eva.put(act_cpu)  # q_i_eva 1.

    buffer = BufferArrayGPU(max_memo, state_dim, action_dim)  # experience replay buffer

    '''initial_exploration'''
    buffer_array, reward_list, step_list = q_o_buf.get()  # q_o_buf 2.
    reward_avg = np.average(reward_list)
    step_sum = sum(step_list)
    buffer.extend_memo(buffer_array)
    q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva 1.

    for epoch in range(max_epoch):  # epoch is episode
        buffer_array, reward_list, step_list = q_o_buf.get()  # q_o_buf n.
        reward_avg = np.average(reward_list)
        step_sum = sum(step_list)
        buffer.extend_memo(buffer_array)

        buffer.init_before_sample()
        loss_a_avg, loss_c_avg = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_buf.put(act_cpu)  # q_i_buf n.
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg))  # q_i_eva n.

        if q_o_eva.qsize() > 0:
            is_solved = q_o_eva.get()  # q_o_eva n.
            if is_solved:
                break
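# --- Hedged sketch (not part of the source): a minimal stand-in for the `args` container these
# --- functions read from. The field names are the attributes accessed across the versions in
# --- this file; the dataclass form, the default values, and the body of init_for_training()
# --- (working directory plus seeding) are assumptions, not the project's API.
import random
from dataclasses import dataclass


@dataclass
class TrainingArgsSketch:
    class_agent: type                # agent class; called rl_agent in the later versions
    env_name: str = 'LunarLanderContinuous-v2'  # placeholder environment
    net_dim: int = 2 ** 8            # width of the actor/critic networks
    max_memo: int = 2 ** 17          # replay buffer capacity
    max_epoch: int = 2 ** 10         # outer-loop count used by the version above
    max_step: int = 2 ** 10          # max steps per exploration rollout
    max_total_step: int = 2 ** 20    # total-step budget (break_step in the later versions)
    batch_size: int = 2 ** 7
    repeat_times: int = 1            # gradient-update multiplier passed to update_parameters()
    reward_scale: float = 1.0
    gamma: float = 0.99
    if_break_early: bool = True      # stop once the evaluator reports the task is solved
    cwd: str = './training_logs'     # working directory (assumed by init_for_training)
    random_seed: int = 1943          # assumed seed field

    def init_for_training(self):
        os.makedirs(self.cwd, exist_ok=True)
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)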
def mp__update_params(args, q_i_buf, q_o_buf, q_i_eva, q_o_eva):  # update network parameters using replay buffer
    class_agent = args.rl_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_step = args.max_step
    max_total_step = args.break_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    cwd = args.cwd
    env_name = args.env_name
    if_stop = args.if_break_early
    del args

    state_dim, action_dim = q_o_buf.get()  # q_o_buf 1.
    agent = class_agent(state_dim, action_dim, net_dim)

    from copy import deepcopy
    act_cpu = deepcopy(agent.act).to(torch.device("cpu"))
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False
    q_i_buf.put(act_cpu)  # q_i_buf 1.
    q_i_eva.put(act_cpu)  # q_i_eva 1.

    buffer = BufferArrayGPU(max_memo, state_dim, action_dim)  # experience replay buffer

    '''initial_exploration'''
    buffer_array, reward_list, step_list = q_o_buf.get()  # q_o_buf 2.
    reward_avg = np.average(reward_list)
    step_sum = sum(step_list)
    buffer.extend_memo(buffer_array)

    '''pre-training and hard update before training loop'''
    buffer.init_before_sample()
    agent.update_parameters(buffer, max_step, batch_size, repeat_times)
    agent.act_target.load_state_dict(agent.act.state_dict())

    q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva 1.

    total_step = step_sum
    if_train = True
    if_solve = False
    while if_train:
        buffer_array, reward_list, step_list = q_o_buf.get()  # q_o_buf n.
        reward_avg = np.average(reward_list)
        step_sum = sum(step_list)
        total_step += step_sum
        buffer.extend_memo(buffer_array)

        buffer.init_before_sample()
        loss_a_avg, loss_c_avg = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_buf.put(act_cpu)  # q_i_buf n.
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg))  # q_i_eva n.

        if q_o_eva.qsize() > 0:
            if_solve = q_o_eva.get()  # q_o_eva n.

        '''break loop rules'''
        if_train = not ((if_stop and if_solve)
                        or total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop'))

    env, state_dim, action_dim, target_reward, if_discrete = build_gym_env(env_name, if_print=False)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)

    q_i_buf.put('stop')
    q_i_eva.put('stop')
    while q_i_buf.qsize() > 0 or q_i_eva.qsize() > 0:
        time.sleep(1)
    time.sleep(4)
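# --- Hedged usage note (not part of the source): from this version on, the break rule polls
# --- os.path.exists(f'{cwd}/stop'), so a run can be ended gracefully by dropping an empty
# --- file named `stop` into the working directory. The default path is a placeholder for args.cwd.
def request_stop__sketch(cwd='./training_logs'):
    from pathlib import Path
    Path(cwd, 'stop').touch()  # mp__update_params notices the file and leaves its training loop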
def mp__update_params(args, q_i_eva, q_o_eva):  # 2020-11-11 update network parameters using replay buffer
    rl_agent = args.rl_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_step = args.max_step
    max_total_step = args.break_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    cwd = args.cwd
    env_name = args.env_name
    reward_scale = args.reward_scale
    if_stop = args.if_break_early
    gamma = args.gamma
    del args

    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)

    '''build agent'''
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    '''send agent to q_i_eva'''
    from copy import deepcopy
    act_cpu = deepcopy(agent.act).to(torch.device("cpu"))
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False
    q_i_eva.put(act_cpu)  # q_i_eva 1.

    '''build replay buffer, init: total_step, reward_avg'''
    total_step = 0
    if rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArrayGPU(max_memo + max_step, state_dim, action_dim, if_ppo=True)  # experience replay buffer
        with torch.no_grad():
            reward_avg = get_episode_reward(env, act_cpu, max_step, torch.device("cpu"), if_discrete)
    else:
        buffer = BufferArrayGPU(max_memo, state_dim, action_dim=1 if if_discrete else action_dim, if_ppo=False)

        '''initial exploration'''
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step, if_discrete, reward_scale, gamma, action_dim)
        reward_avg = np.average(rewards)
        step_sum = sum(steps)

        '''pre-training and hard update before training loop'''
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())

        q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva n.
        total_step += step_sum

    '''training loop'''
    if_train = True
    if_solve = False
    while if_train:
        '''update replay buffer by interacting with the environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)
        reward_avg = np.average(rewards) if len(rewards) else reward_avg
        step_sum = sum(steps)
        total_step += step_sum

        '''update network parameters by randomly sampling the buffer for gradient descent'''
        buffer.update_pointer_before_sample()
        loss_a_avg, loss_c_avg = agent.update_policy(buffer, max_step, batch_size, repeat_times)

        '''send the updated actor to the evaluator (which saves the agent with max reward)'''
        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg))  # q_i_eva n.

        if q_o_eva.qsize() > 0:
            if_solve = q_o_eva.get()  # q_o_eva n.

        '''break loop rules'''
        if_train = not ((if_stop and if_solve)
                        or total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop'))

    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)

    q_i_eva.put('stop')
    while q_i_eva.qsize() > 0:
        time.sleep(1)
    time.sleep(4)
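# --- Hedged sketch (not part of the source): one way to wire the 2020-11-11 version above to
# --- an evaluator process. Only the q_i_eva / q_o_eva message shapes are taken from
# --- mp__update_params itself, and build_env / get_episode_reward are the project helpers it
# --- already calls; the evaluator body and train_in_parallel__sketch are hypothetical
# --- stand-ins for the project's own evaluation and launch code.
import multiprocessing as mp


def mp_evaluate_agent__sketch(args, q_i_eva, q_o_eva):
    env, state_dim, action_dim, target_reward, if_discrete = build_env(args.env_name, if_print=False)
    max_step = args.max_step
    device = torch.device("cpu")
    del args

    act_cpu = q_i_eva.get()  # q_i_eva 1. first actor copy, sent before any training statistics
    while True:
        msg = q_i_eva.get()  # q_i_eva n. either 'stop' or (act, reward_avg, step_sum, loss_a, loss_c)
        if isinstance(msg, str) and msg == 'stop':
            break
        act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg = msg

        with torch.no_grad():
            episode_return = get_episode_reward(env, act_cpu, max_step, device, if_discrete)
        q_o_eva.put(episode_return >= target_reward)  # if_solve flag polled by the updater


def train_in_parallel__sketch(args):
    q_i_eva = mp.Queue(maxsize=16)  # updater -> evaluator (actor copies + statistics)
    q_o_eva = mp.Queue(maxsize=16)  # evaluator -> updater (if_solve flag)
    processes = [mp.Process(target=mp__update_params, args=(args, q_i_eva, q_o_eva)),
                 mp.Process(target=mp_evaluate_agent__sketch, args=(args, q_i_eva, q_o_eva))]
    for p in processes:
        p.start()
    for p in processes:
        p.join()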