import os
import time

import torch

# ReplayBufferMP and _explore_before_train are assumed to be defined elsewhere in this module/repo.


def mp__update_params(args, pipe1_eva, pipe1_exp_list):
    args.init_before_training(if_main=False)

    '''basic arguments'''
    env = args.env
    cwd = args.cwd
    agent = args.agent
    rollout_num = args.rollout_num

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_break_early

    del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete

    '''init: Agent, ReplayBuffer'''
    agent.init(net_dim, state_dim, action_dim)
    if_on_policy = agent.__class__.__name__ in {'AgentPPO', 'AgentGaePPO', 'AgentInterPPO'}

    '''send'''
    pipe1_eva.send(agent.act)  # send
    # act = pipe2_eva.recv()  # recv

    buffer_mp = ReplayBufferMP(max_len=max_memo + max_step * rollout_num, if_on_policy=if_on_policy,
                               state_dim=state_dim, action_dim=1 if if_discrete else action_dim,
                               rollout_num=rollout_num, if_gpu=True)

    '''prepare for training'''
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = 0
            for i in range(rollout_num):
                pipe1_exp = pipe1_exp_list[i]

                # pipe2_exp.send((buffer.buf_state[:buffer.now_len], buffer.buf_other[:buffer.now_len]))
                buf_state, buf_other = pipe1_exp.recv()

                steps += len(buf_state)
                buffer_mp.extend_buffer(buf_state, buf_other, i)

        agent.update_net(buffer_mp, target_step, batch_size, repeat_times)  # pre-training and hard update
        agent.act_target.load_state_dict(agent.act.state_dict()) if getattr(agent, 'act_target', None) else None
        agent.cri_target.load_state_dict(agent.cri.state_dict()) if getattr(agent, 'cri_target', None) else None
    total_step = steps

    '''send'''
    pipe1_eva.send((agent.act, steps, 0, 0.5))  # send
    # act, steps, obj_a, obj_c = pipe2_eva.recv()  # recv

    '''start training'''
    if_solve = False
    while not ((if_break_early and if_solve)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        '''update ReplayBuffer'''
        steps = 0  # send by pipe1_eva
        for i in range(rollout_num):
            pipe1_exp = pipe1_exp_list[i]
            '''send'''
            pipe1_exp.send(agent.act)
            # agent.act = pipe2_exp.recv()

            '''recv'''
            # pipe2_exp.send((buffer.buf_state[:buffer.now_len], buffer.buf_other[:buffer.now_len]))
            buf_state, buf_other = pipe1_exp.recv()

            steps += len(buf_state)
            buffer_mp.extend_buffer(buf_state, buf_other, i)
        total_step += steps

        '''update network parameters'''
        obj_a, obj_c = agent.update_net(buffer_mp, target_step, batch_size, repeat_times)

        '''saves the agent with max reward'''
        '''send'''
        pipe1_eva.send((agent.act, steps, obj_a, obj_c))  # send
        # q_i_eva_get = pipe2_eva.recv()  # recv

        if pipe1_eva.poll():  # non-blocking check: only recv when the evaluator has sent if_solve back
            '''recv'''
            # pipe2_eva.send(if_solve)  # send
            if_solve = pipe1_eva.recv()  # recv

    buffer_mp.print_state_norm(env.neg_state_avg if hasattr(env, 'neg_state_avg') else None,
                               env.div_state_std if hasattr(env, 'div_state_std') else None)  # 2020-12-12

    '''send'''
    pipe1_eva.send('stop')  # tell the evaluator process to exit
    # q_i_eva_get = pipe2_eva.recv()  # recv
    time.sleep(4)
# Variant of mp__update_params that builds the agent from args.agent_rl and pre-fills the
# replay buffer in-process via _explore_before_train() instead of receiving warm-up data from the worker pipes.
def mp__update_params(args, pipe1_eva, pipe1_exp_list):
    agent_rl = args.agent_rl  # basic arguments
    env = args.env
    cwd = args.cwd
    rollout_num = args.rollout_num
    gamma = args.gamma  # training arguments
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    reward_scale = args.reward_scale
    break_step = args.break_step
    if_break_early = args.if_break_early

    del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step

    '''build agent'''
    agent = agent_rl(net_dim, state_dim, action_dim)  # build AgentRL
    pipe1_eva.send(agent.act)  # send
    # act = pipe2_eva.recv()  # recv

    if_on_policy = agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO'}
    buffer_mp = ReplayBufferMP(max_memo + max_step * rollout_num, state_dim, if_on_policy=if_on_policy,
                               action_dim=1 if if_discrete else action_dim,
                               rollout_num=rollout_num)  # build experience replay buffer

    steps = 0
    if not if_on_policy:
        with torch.no_grad():  # update replay buffer
            for _buffer in buffer_mp.buffers:
                steps += _explore_before_train(env, _buffer, target_step // rollout_num, reward_scale, gamma)

        agent.update_net(buffer_mp, target_step, batch_size, repeat_times)  # pre-training and hard update
        agent.act_target.load_state_dict(agent.act.state_dict()) if 'act_target' in dir(agent) else None
    total_step = steps

    pipe1_eva.send((agent.act, steps, 0, 0.5))  # pipe1_eva (act, steps, obj_a, obj_c)

    if_solve = False
    while not ((if_break_early and if_solve)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        '''update ReplayBuffer'''
        steps = 0
        for i in range(rollout_num):
            pipe1_exp = pipe1_exp_list[i]
            pipe1_exp.send(agent.act)
            # agent.act = pipe2_exp.recv()

            # pipe2_exp.send((buffer.buf_state[:buffer.now_len], buffer.buf_other[:buffer.now_len]))
            buf_state, buf_other = pipe1_exp.recv()

            steps += len(buf_state)
            buffer_mp.extend_memo_mp(buf_state, buf_other, i)
        total_step += steps

        '''update network parameters'''
        obj_a, obj_c = agent.update_net(buffer_mp, target_step, batch_size, repeat_times)

        '''saves the agent with max reward'''
        pipe1_eva.send((agent.act, steps, obj_a, obj_c))  # pipe1_eva act_cpu

        if pipe1_eva.poll():  # non-blocking check: only recv when the evaluator has sent if_solve back
            if_solve = pipe1_eva.recv()
            # pipe2_eva.send(if_solve)

    buffer_mp.print_state_norm(env.neg_state_avg if hasattr(env, 'neg_state_avg') else None,
                               env.div_state_std if hasattr(env, 'div_state_std') else None)  # 2020-12-12

    pipe1_eva.send('stop')  # eva_pipe stop  # send to mp_evaluate_agent
    time.sleep(4)
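# --- Usage sketch (assumption, not part of the original file) -------------------------------
# A minimal sketch of how mp__update_params could be wired up, based only on the pipe names
# and comments above: one evaluator process (mp_evaluate_agent, which the comments reference),
# `rollout_num` exploration workers, and one learner process running mp__update_params.
# The worker function name `mp__explore_in_env` is a hypothetical placeholder; only the pipe
# layout (pipe1_eva/pipe2_eva, pipe1_exp/pipe2_exp per worker) is taken from the code above.
def train_mp_sketch(args):
    import multiprocessing as mp

    processes = list()

    pipe1_eva, pipe2_eva = mp.Pipe()  # learner <-> evaluator
    processes.append(mp.Process(target=mp_evaluate_agent, args=(args, pipe2_eva)))

    pipe1_exp_list = list()  # one pipe pair per exploration worker
    for worker_id in range(args.rollout_num):
        pipe1_exp, pipe2_exp = mp.Pipe()
        pipe1_exp_list.append(pipe1_exp)
        processes.append(mp.Process(target=mp__explore_in_env,  # hypothetical worker function
                                    args=(args, pipe2_exp, worker_id)))

    processes.append(mp.Process(target=mp__update_params, args=(args, pipe1_eva, pipe1_exp_list)))

    for p in processes:
        p.start()
    for p in processes:
        p.join()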