import os
import time
from copy import deepcopy

import torch

# ReplayBuffer, Evaluator, deepcopy_or_rebuild_env and explore_before_training are
# defined elsewhere in the surrounding repo and are used by the functions below.


def mp_evaluator(args, comm_eva, agent_id=0):
    args.init_before_training(if_main=False)

    if True:
        '''basic arguments'''
        cwd = args.cwd
        agent = args.agent
        env = args.env
        state_dim = env.state_dim
        action_dim = env.action_dim
        # if_discrete = env.if_discrete

        '''training arguments'''
        net_dim = args.net_dim
        # max_memo = args.max_memo
        break_step = args.break_step
        # batch_size = args.batch_size
        # target_step = args.target_step
        # repeat_times = args.repeat_times
        learning_rate = args.learning_rate
        if_break_early = args.if_allow_break

        # gamma = args.gamma
        # reward_scale = args.reward_scale
        if_per_or_gae = args.if_per_or_gae
        # soft_update_tau = args.soft_update_tau

        '''evaluating arguments'''
        show_gap = args.eval_gap
        eval_env = args.eval_env
        eval_times1 = args.eval_times1
        eval_times2 = args.eval_times2
        del args

    '''init: Agent'''
    agent.init(net_dim, state_dim, action_dim, learning_rate, if_per_or_gae, gpu_id=-1)
    agent.save_or_load_agent(cwd, if_save=False)

    act_cpu = agent.act.to(torch.device("cpu"))
    act_cpu.eval()
    [setattr(param, 'requires_grad', False) for param in act_cpu.parameters()]
    del agent

    '''init Evaluator'''
    eval_env = deepcopy_or_rebuild_env(env) if eval_env is None else eval_env
    evaluator = Evaluator(cwd=cwd, agent_id=agent_id, device=torch.device("cpu"),
                          env=eval_env, eval_times1=eval_times1, eval_times2=eval_times2,
                          eval_gap=show_gap)  # build Evaluator
    evaluator.save_or_load_recoder(if_save=False)

    if_train = True
    with torch.no_grad():
        while if_train:
            if_train = comm_eva.evaluate_and_save0(act_cpu, evaluator,
                                                   if_break_early, break_step, cwd)

    print(f'| UsedTime: {time.time() - evaluator.start_time:.0f} | SavedDir: {cwd}')
    evaluator.save_or_load_recoder(if_save=True)
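# A hedged sketch (not from the original repo) of how `mp_evaluator` above could be
# launched in its own process. `comm_eva` stands for whatever communication object
# provides `evaluate_and_save0`; its construction is not shown in this snippet.
import torch.multiprocessing as mp


def launch_mp_evaluator(args, comm_eva, agent_id=0):
    mp.set_start_method('spawn', force=True)  # 'spawn' is the safer default when CUDA is involved
    proc = mp.Process(target=mp_evaluator, args=(args, comm_eva, agent_id))
    proc.start()
    return proc  # the caller should join() or terminate() the process once training ends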
def train_and_evaluate(args):
    args.init_before_training()

    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    learning_rate = args.learning_rate
    if_break_early = args.if_allow_break

    gamma = args.gamma
    reward_scale = args.reward_scale
    if_per_or_gae = args.if_per_or_gae
    soft_update_tau = args.soft_update_tau

    '''evaluating arguments'''
    show_gap = args.eval_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    if_vec_env = getattr(env, 'env_num', 1) > 1
    env_eval = deepcopy_or_rebuild_env(env) if args.env_eval is None else args.env_eval
    del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: environment'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    env_eval = deepcopy(env) if env_eval is None else deepcopy(env_eval)

    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim, learning_rate, if_per_or_gae)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    # Off-policy agents need a persistent ReplayBuffer; on-policy agents rebuild a buffer
    # tuple from each rollout (see the asserts in the training loop below).
    buffer = tuple() if if_on_policy else ReplayBuffer(max_len=max_memo,
                                                       state_dim=state_dim,
                                                       action_dim=action_dim,
                                                       if_use_per=if_per_or_gae)

    evaluator = Evaluator(cwd=cwd, agent_id=gpu_id, device=agent.device, env=env_eval,
                          eval_times1=eval_times1, eval_times2=eval_times2,
                          eval_gap=show_gap)  # build Evaluator

    '''prepare for training'''
    agent.state = env.reset()
    total_step = 0

    '''start training'''
    if_train = True
    while if_train:
        with torch.no_grad():
            if if_on_policy:
                buffer_tuple1 = agent.explore_env(env, target_step, reward_scale, gamma)
                buffer_tuple2 = agent.prepare_buffer(buffer_tuple1)
                steps = buffer_tuple2[0].size(0)
                buffer = buffer_tuple2
            else:
                trajectory_list = agent.explore_env(env, target_step, reward_scale, gamma)
                steps = len(trajectory_list)
                buffer.extend_buffer_from_list(trajectory_list)
        total_step += steps
        # assert if_on_policy and isinstance(buffer, tuple)
        # assert (not if_on_policy) and isinstance(buffer, ReplayBuffer)

        logging_tuple = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)

        with torch.no_grad():
            if_reach_goal = evaluator.evaluate_save(agent.act, steps, logging_tuple)
            if_train = not ((if_break_early and if_reach_goal)
                            or total_step > break_step
                            or os.path.exists(f'{cwd}/stop'))

    print(f'| UsedTime: {time.time() - evaluator.start_time:.0f} | SavedDir: {cwd}')
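# Several functions here fall back to `deepcopy_or_rebuild_env(env)` when no evaluation
# env is supplied. A hedged sketch of what such a helper might do (the `env_name`
# attribute and the rebuild path are assumptions; a real rebuild would also need to
# re-apply whatever preprocessing wrapper produced state_dim/action_dim/if_discrete):
import gym


def deepcopy_or_rebuild_env_sketch(env):
    try:
        return deepcopy(env)  # cheap path: the env (and its wrappers) are deep-copyable
    except Exception as error:  # some envs hold handles (viewers, sockets) that cannot be copied
        print(f'| deepcopy_or_rebuild_env_sketch: deepcopy failed ({error}), rebuilding')
        return gym.make(env.env_name)  # assumed attribute recording the original gym id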
def mp_evaluator(args, pipe_eva):
    args.init_before_training(process_id=-1)

    if True:
        '''arguments: basic'''
        cwd = args.cwd
        env = args.env
        agent = args.agent
        gpu_id = args.gpu_id
        # worker_num = args.worker_num

        '''arguments: train'''
        # net_dim = args.net_dim
        # max_memo = args.max_memo
        break_step = args.break_step
        # batch_size = args.batch_size
        # target_step = args.target_step
        # repeat_times = args.repeat_times
        # learning_rate = args.learning_rate
        if_break_early = args.if_allow_break

        # gamma = args.gamma
        # reward_scale = args.reward_scale
        # if_per_or_gae = args.if_per_or_gae
        # soft_update_tau = args.soft_update_tau

        '''arguments: evaluate'''
        show_gap = args.eval_gap
        eval_times1 = args.eval_times1
        eval_times2 = args.eval_times2
        env_eval = deepcopy_or_rebuild_env(env) if args.env_eval is None else args.env_eval

        '''arguments: environment'''
        # max_step = env.max_step
        # state_dim = env.state_dim
        # action_dim = env.action_dim
        # if_discrete = env.if_discrete
        del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: Evaluator'''
    evaluator = Evaluator(cwd=cwd, agent_id=gpu_id, device=agent.device, env=env_eval,
                          eval_times1=eval_times1, eval_times2=eval_times2,
                          eval_gap=show_gap)  # build Evaluator

    # pipe_eva[1].send((act_cpu, steps))
    act_cpu, steps = pipe_eva[0].recv()

    '''start training'''
    sum_step = steps
    if_train = True
    while if_train:
        # pipe_eva[1].send((act_state_dict, steps, logging_tuple))
        act_state_dict, steps, logging_tuple = pipe_eva[0].recv()

        sum_step += steps
        if act_state_dict is not None:
            act_cpu.load_state_dict(act_state_dict)
            if_reach_goal = evaluator.evaluate_save(act_cpu, sum_step, logging_tuple)
            sum_step = 0
            if_train = not ((if_break_early and if_reach_goal)
                            or evaluator.total_step > break_step
                            or os.path.exists(f'{cwd}/stop'))

    print(f'| SavedDir: {cwd}\n'
          f'| UsedTime: {time.time() - evaluator.start_time:.0f}')
    pipe_eva[0].send(if_train)
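# A hedged sketch (an assumption, not the repo's learner code) of the sending side of
# the pipe protocol that `mp_evaluator` above expects: a one-time handshake with a CPU
# copy of the actor, then repeated (act_state_dict, steps, logging_tuple) messages
# (act_state_dict=None would only report collected steps), and finally a poll for the
# `if_train` flag that the evaluator sends back once it decides training should stop.
def learner_pipe_sketch(agent, pipe_eva, explore_and_update_once):
    act_cpu = deepcopy(agent.act).to(torch.device('cpu'))
    pipe_eva[1].send((act_cpu, 0))  # handshake: actor copy + steps collected so far

    if_train = True
    while if_train:
        steps, logging_tuple = explore_and_update_once()  # hypothetical: one rollout + update
        pipe_eva[1].send((agent.act.state_dict(), steps, logging_tuple))
        if pipe_eva[1].poll():  # non-blocking check for the evaluator's stop signal
            if_train = pipe_eva[1].recv()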
def train_and_evaluate(args, agent_id=0):
    args.init_before_training(if_main=True)

    if True:
        '''basic arguments'''
        cwd = args.cwd
        agent = args.agent
        env = args.env
        state_dim = env.state_dim
        action_dim = env.action_dim
        if_discrete = env.if_discrete

        '''training arguments'''
        net_dim = args.net_dim
        max_memo = args.max_memo
        break_step = args.break_step
        batch_size = args.batch_size
        target_step = args.target_step
        repeat_times = args.repeat_times
        learning_rate = args.learning_rate
        if_break_early = args.if_allow_break

        gamma = args.gamma
        reward_scale = args.reward_scale
        if_per_or_gae = args.if_per_or_gae
        soft_update_tau = args.soft_update_tau

        '''evaluating arguments'''
        show_gap = args.eval_gap
        eval_env = args.eval_env
        eval_times1 = args.eval_times1
        eval_times2 = args.eval_times2
        del args

    '''init: Agent'''
    agent.init(net_dim, state_dim, action_dim, learning_rate, if_per_or_gae)
    agent.save_or_load_agent(cwd, if_save=False)
    if_on_policy = agent.if_on_policy

    '''init Evaluator'''
    eval_env = deepcopy_or_rebuild_env(env) if eval_env is None else eval_env
    evaluator = Evaluator(cwd=cwd, agent_id=agent_id, device=agent.device, env=eval_env,
                          eval_times1=eval_times1, eval_times2=eval_times2,
                          eval_gap=show_gap)  # build Evaluator
    evaluator.save_or_load_recoder(if_save=False)

    '''init ReplayBuffer'''
    if if_on_policy:
        buffer = list()

        def update_buffer(_trajectory_list):
            buffer[:] = agent.prepare_buffer(_trajectory_list)
            # buffer = (state, action, r_sum, logprob, advantage)
            _steps = buffer[2].size(0)  # buffer[2] = r_sum
            _r_exp = buffer[2].mean().item()  # buffer[2] = r_sum
            return _steps, _r_exp

        assert isinstance(buffer, list)
    else:
        buffer = ReplayBuffer(state_dim=state_dim,
                              action_dim=1 if if_discrete else action_dim,
                              max_len=max_memo, if_use_per=if_per_or_gae)
        buffer.save_or_load_history(cwd, if_save=False)

        def update_buffer(_trajectory_list):
            _state = torch.as_tensor([item[0] for item in _trajectory_list], dtype=torch.float32)
            _other = torch.as_tensor([item[1] for item in _trajectory_list], dtype=torch.float32)
            buffer.extend_buffer(_state, _other)

            _steps = _other.size()[0]
            _r_exp = _other[:, 0].mean().item()  # other = (reward, mask, ...)
            return _steps, _r_exp

        assert isinstance(buffer, ReplayBuffer)

    '''start training'''
    if if_on_policy:
        agent.state = env.reset()
    elif buffer.max_len != 0:  # if_off_policy
        agent.state = env.reset()
    else:  # if_off_policy
        with torch.no_grad():  # update replay buffer
            trajectory_list = explore_before_training(env, target_step, reward_scale, gamma)
            steps, r_exp = update_buffer(trajectory_list)
            agent.state = trajectory_list[-1][0]  # trajectory_list[-1][0] = (state, other)[0] = state

        agent.update_net(buffer, target_step, batch_size, repeat_times)

        agent.act_target.load_state_dict(agent.act.state_dict()) if agent.if_use_act_target else None
        agent.cri_target.load_state_dict(agent.cri.state_dict()) if agent.if_use_cri_target else None
        evaluator.total_step += steps

    if_train = True
    while if_train:
        with torch.no_grad():
            trajectory_list = agent.explore_env(env, target_step, reward_scale, gamma)
            steps, r_exp = update_buffer(trajectory_list)

        logging_tuple = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)

        with torch.no_grad():
            if_reach_goal = evaluator.evaluate_and_save(agent.act, steps, r_exp, logging_tuple)
            if_train = not ((if_break_early and if_reach_goal)
                            or evaluator.total_step > break_step
                            or os.path.exists(f'{cwd}/stop'))

    print(f'| UsedTime: {time.time() - evaluator.start_time:.0f} | SavedDir: {cwd}')

    agent.save_or_load_agent(cwd, if_save=True)
    buffer.save_or_load_history(cwd, if_save=True) if not if_on_policy else None
    evaluator.save_or_load_recoder(if_save=True)
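# A usage sketch for the `train_and_evaluate(args, agent_id=0)` variant above. The
# hyper-parameter container (`Arguments`), agent class (`AgentPPO`) and env wrapper
# (`PreprocessEnv`) are assumed to come from the surrounding repo and are not defined
# in this snippet; their names may differ in your version.
def demo_train_and_evaluate():
    import gym

    args = Arguments()                                      # container read field-by-field above
    args.agent = AgentPPO()                                 # any agent with .init/.explore_env/.update_net
    args.env = PreprocessEnv(env=gym.make('Pendulum-v0'))   # env exposing state_dim/action_dim/if_discrete
    args.eval_env = None                                    # falls back to deepcopy_or_rebuild_env(env)
    train_and_evaluate(args, agent_id=0)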
def train_and_evaluate(args):
    args.init_before_training()

    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    learning_rate = args.learning_rate
    if_break_early = args.if_allow_break

    gamma = args.gamma
    reward_scale = args.reward_scale
    if_per_or_gae = args.if_per_or_gae
    soft_update_tau = args.soft_update_tau

    '''evaluating arguments'''
    show_gap = args.eval_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    if_vec_env = getattr(env, 'env_num', 1) > 1
    env_eval = deepcopy_or_rebuild_env(env) if args.env_eval is None else args.env_eval
    del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: environment'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete

    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim, learning_rate, if_per_or_gae)
    if_on_policy = agent.if_on_policy

    '''init: ReplayBuffer'''
    agent.state = env.reset()
    buffer = ReplayBuffer(max_len=target_step if if_on_policy else max_memo,
                          if_on_policy=if_on_policy, if_per_or_gae=if_per_or_gae,
                          state_dim=state_dim, action_dim=action_dim,
                          if_discrete=if_discrete)
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            trajectory_list, state = explore_before_training(env, target_step, reward_scale, gamma)
            agent.state = state
            steps = len(trajectory_list)
            buffer.extend_buffer_from_list(trajectory_list)

        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update

        # hard update for the first time
        agent.act_target.load_state_dict(agent.act.state_dict()) if getattr(agent, 'act_target', None) else None
        agent.cri_target.load_state_dict(agent.cri.state_dict()) if getattr(agent, 'cri_target', None) else None
    total_step = steps

    '''init: Evaluator'''
    evaluator = Evaluator(cwd=cwd, agent_id=gpu_id, device=agent.device, env=env_eval,
                          eval_times1=eval_times1, eval_times2=eval_times2,
                          eval_gap=show_gap)  # build Evaluator

    '''start training'''
    if_train = True
    if if_vec_env:
        while if_train:
            with torch.no_grad():
                buffer = agent.explore_envs(env, target_step, reward_scale, gamma)
                steps = buffer[0].size(0) * buffer[0].size(1)
                total_step += steps
                buffer = agent.prepare_buffers(buffer)

            logging_tuple = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)

            with torch.no_grad():  # speed up running
                if_reach_goal = evaluator.evaluate_save(agent.act, steps, logging_tuple)
                if_train = not ((if_break_early and if_reach_goal)
                                or total_step > break_step
                                or os.path.exists(f'{cwd}/stop'))
    else:
        while if_train:
            with torch.no_grad():
                trajectory_list = agent.explore_env(env, target_step, reward_scale, gamma)
                steps = len(trajectory_list)
                total_step += steps
                buffer.extend_buffer_from_list(trajectory_list)

            logging_tuple = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)

            with torch.no_grad():  # speed up running
                if_reach_goal = evaluator.evaluate_save(agent.act, steps, logging_tuple)
                if_train = not ((if_break_early and if_reach_goal)
                                or total_step > break_step
                                or os.path.exists(f'{cwd}/stop'))
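# A hedged sketch of what `explore_before_training` could look like for the call
# signature used directly above (it returns the trajectory list and the final state):
# collect transitions with random actions so an off-policy agent has data to sample
# before its first update. The (reward * reward_scale, 0.0 if done else gamma, *action)
# layout of `other` is an assumption based on how the off-policy `update_buffer` in the
# single-process `train_and_evaluate` above unpacks it (reward at index 0, mask at 1).
import numpy as np


def explore_before_training_sketch(env, target_step, reward_scale, gamma):
    trajectory_list = []
    state = env.reset()
    for _ in range(target_step):
        if hasattr(env, 'action_space'):  # gym-style env
            action = env.action_space.sample()
        else:  # fall back to a uniform continuous action
            action = np.random.uniform(-1.0, 1.0, env.action_dim)
        next_state, reward, done, _ = env.step(action)
        other = (reward * reward_scale, 0.0 if done else gamma, *np.atleast_1d(action))
        trajectory_list.append((state, other))
        state = env.reset() if done else next_state
    return trajectory_list, state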