import os
from copy import deepcopy

import numpy as np
import torch


def mp_explore_in_env(args, pipe2_exp, worker_id):
    env = args.env
    reward_scale = args.reward_scale
    gamma = args.gamma
    random_seed = args.random_seed
    agent_rl = args.agent_rl
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    rollout_num = args.rollout_num
    del args

    torch.manual_seed(random_seed + worker_id)
    np.random.seed(random_seed + worker_id)

    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step

    '''build agent'''
    agent = agent_rl(net_dim, state_dim, action_dim)  # training agent
    agent.state = env.reset()
    # agent.device = torch.device('cpu')  # env_cpu--act_cpu is a little faster than env_cpu--act_gpu, but has high CPU utilization

    '''build replay buffer, init: total_step, reward_avg'''
    if_on_policy = bool(agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO', 'AgentInterPPO'})
    buffer = ReplayBuffer(max_memo // rollout_num + max_step, state_dim, if_on_policy=if_on_policy,
                          action_dim=1 if if_discrete else action_dim)  # build experience replay buffer

    exp_step = target_step // rollout_num
    with torch.no_grad():
        while True:
            # learner side: pipe1_exp.send(agent.act)
            agent.act = pipe2_exp.recv()

            agent.update_buffer(env, buffer, exp_step, reward_scale, gamma)

            buffer.update__now_len__before_sample()
            pipe2_exp.send((buffer.buf_state[:buffer.now_len],
                            buffer.buf_other[:buffer.now_len]))
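
# A minimal sketch of the learner-side launcher that pairs with mp_explore_in_env.
# This launcher is an illustrative assumption, not this repo's actual trainer: it
# shows only the pipe protocol (send the actor network, receive back the filled
# buffer slices). `agent` is any built AgentRL instance, and `extend_buffer` is a
# hypothetical merge method on the learner's ReplayBuffer.
import torch.multiprocessing as mp


def mp_train_sketch(args, agent, learner_buffer, num_rounds):
    pipes = [mp.Pipe() for _ in range(args.rollout_num)]
    process = [mp.Process(target=mp_explore_in_env, args=(args, pipe2, worker_id))
               for worker_id, (pipe1, pipe2) in enumerate(pipes)]
    [p.start() for p in process]

    for _ in range(num_rounds):
        for pipe1, _ in pipes:
            pipe1.send(agent.act)  # unblocks the worker's pipe2_exp.recv()
        for pipe1, _ in pipes:
            buf_state, buf_other = pipe1.recv()  # matches the worker's pipe2_exp.send(...)
            learner_buffer.extend_buffer(buf_state, buf_other)  # hypothetical merge API

    [p.terminate() for p in process]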
def train_and_evaluate(args):
    args.init_before_training()

    cwd = args.cwd
    env = args.env
    env_eval = args.env_eval
    agent_id = args.gpu_id
    agent_rl = args.agent_rl  # basic arguments

    gamma = args.gamma  # training arguments
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    reward_scale = args.reward_scale
    if_per = args.if_per

    show_gap = args.show_gap  # evaluate arguments
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    break_step = args.break_step
    if_break_early = args.if_break_early
    env_eval = deepcopy(env) if env_eval is None else deepcopy(env_eval)
    del args  # the hyper-parameters are unpacked above so they are easy to see

    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step

    '''init: Agent, Evaluator, ReplayBuffer'''
    agent = agent_rl(net_dim, state_dim, action_dim)  # build AgentRL
    agent.state = env.reset()

    evaluator = Evaluator(cwd=cwd, agent_id=agent_id, device=agent.device, env=env_eval,
                          eval_times1=eval_times1, eval_times2=eval_times2,
                          show_gap=show_gap)  # build Evaluator

    if_on_policy = agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO'}
    buffer = ReplayBuffer(max_memo + max_step, state_dim, if_on_policy=if_on_policy, if_per=if_per,
                          action_dim=1 if if_discrete else action_dim)  # build experience replay buffer

    if if_on_policy:
        steps = 0
    else:  # fill the replay buffer with random transitions before training
        with torch.no_grad():  # update replay buffer
            steps = _explore_before_train(env, buffer, target_step, reward_scale, gamma)
        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())
    total_step = steps

    if_solve = False
    while not ((if_break_early and if_solve)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        with torch.no_grad():  # speed up running
            steps = agent.update_buffer(env, buffer, target_step, reward_scale, gamma)
        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size, repeat_times)

        with torch.no_grad():  # speed up running
            if_solve = evaluator.evaluate_act__save_checkpoint(agent.act, steps, obj_a, obj_c)
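
# A minimal usage sketch for train_and_evaluate (an assumption for illustration):
# it sets only attributes the function reads above. `Arguments`, `PreprocessEnv`
# and `AgentSAC` stand for the hyper-parameter container, gym-env wrapper and an
# off-policy agent defined elsewhere in this repo; the names may differ in your version.
if __name__ == '__main__':
    import gym

    args = Arguments()
    args.env = PreprocessEnv(gym.make('LunarLanderContinuous-v2'))
    args.agent_rl = AgentSAC  # off-policy, so the buffer is pre-filled before the loop
    args.break_step = int(1e5)
    train_and_evaluate(args)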