def run(comm, env, policy, critic, critic_opt, critic_target, policy_path, action_bound, optimizer):
    global_update = 0
    global_step = 0
    update = 0

    # world reset
    if env.index == 0:
        env.reset_world()

    # replay memory
    replay_memory = ReplayMemory(REPLAY_SIZE, SEED)

    for id in range(MAX_EPISODES):
        # reset
        env.reset_pose()
        terminal = False
        ep_reward = 0
        step = 1

        # generate goal
        env.generate_goal_point()

        # get initial state
        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        # episode
        while not terminal and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            ## get action
            # -------------------------------------------------------------------------
            # generate actions at rank == 0
            a, logprob, scaled_action = generate_action(env=env, state_list=state_list,
                                                        policy=policy, action_bound=action_bound)

            ## execute actions
            # -------------------------------------------------------------------------
            real_action = comm.scatter(scaled_action, root=0)
            env.control_vel(real_action)
            rospy.sleep(0.001)

            ## get reward
            # -------------------------------------------------------------------------
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            ## get next state
            # -------------------------------------------------------------------------
            s_next = env.get_laser_observation()
            obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            state_next_list = comm.gather(state_next, root=0)

            ## training
            # -------------------------------------------------------------------------
            if env.index == 0:
                # store rank 0's transition together with the gathered reward/terminal lists
                replay_memory.push(state[0], state[1], state[2], a, logprob, r_list,
                                   state_next[0], state_next[1], state_next[2], terminal_list)

                if len(replay_memory) > BATCH_SIZE:
                    ## update
                    # -------------------------------------------------------------------------
                    update = sac_update_stage(policy=policy, optimizer=optimizer, critic=critic,
                                              critic_opt=critic_opt, critic_target=critic_target,
                                              batch_size=BATCH_SIZE, memory=replay_memory,
                                              epoch=EPOCH, replay_size=REPLAY_SIZE, tau=TAU,
                                              alpha=ALPHA, gamma=GAMMA, updates=update,
                                              update_interval=UPDATE_INTERVAL, num_step=BATCH_SIZE,
                                              num_env=NUM_ENV, frames=LASER_HIST,
                                              obs_size=OBS_SIZE, act_size=ACT_SIZE)
                    global_update += 1

            step += 1
            state = state_next

        ## save policy
        # -------------------------------------------------------------------------
        if env.index == 0:
            if global_update != 0 and global_update % 1000 == 0:
                torch.save(policy.state_dict(), policy_path + '/policy_{}'.format(global_update))
                torch.save(critic.state_dict(), policy_path + '/critic_{}'.format(global_update))
                logger.info('########################## model saved when update {} times'
                            '#########################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0]) ** 2 +
                           (env.goal_point[1] - env.init_pose[1]) ** 2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
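# Illustration (not part of the original training code): the loop above relies on an
# MPI pattern where rank 0 gathers every agent's state, computes one action per agent,
# and scatters the actions back so each rank executes only its own. The self-contained
# sketch below shows just that communication pattern with mpi4py; the zero "policy" is
# a placeholder, not the project's generate_action().
def _gather_scatter_demo():
    from mpi4py import MPI
    import numpy as np

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    local_obs = np.random.rand(4)              # each rank's own observation
    obs_list = comm.gather(local_obs, root=0)  # rank 0 receives a list with one entry per rank

    if rank == 0:
        # rank 0 computes one 2-D action per gathered observation (placeholder policy)
        actions = [np.zeros(2) for _ in obs_list]
    else:
        actions = None

    # each rank gets back exactly the action that was computed for it
    my_action = comm.scatter(actions, root=0)
    print('rank %d received action %s' % (rank, my_action))

# To try it, launch a script that calls _gather_scatter_demo() under MPI with
# several processes, e.g. mpirun -np 4 python <script>.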
def run(comm, env, policy, policy_path, action_bound, optimizer):
    actor, actor_target, critic, critic_target = policy
    actor_opt, critic_opt = optimizer

    # set up OU exploration noise
    noise_c = OUNoise(action_dim=ACT_SIZE, action_bound=action_bound)
    noise_c.reset()

    global_update = 0
    global_step = 0
    replay_memory = ReplayMemory(REPLAY_SIZE, SEED)

    # world reset
    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):
        # reset
        env.reset_pose()
        terminal = False
        ep_reward = 0
        step = 1

        # generate goal
        env.generate_goal_point()

        # get initial state
        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        # episode
        while not terminal and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            ## get action
            # -------------------------------------------------------------------------
            # generate actions at rank == 0
            mean, action = generate_action(env=env, state_list=state_list,
                                           actor=actor, action_bound=action_bound)

            ## exploration
            # -------------------------------------------------------------------------
            a = noise_c.get_action(mean, step)
            # noise = np.random.normal(0, exploration_noise, size=2)  # check action size
            # a = mean + noise
            # a = a.clip(action_bound[0], action_bound[1])

            ## execute actions
            # -------------------------------------------------------------------------
            real_action = comm.scatter(a, root=0)
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)

            ## get reward
            # -------------------------------------------------------------------------
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            ## get next state
            # -------------------------------------------------------------------------
            s_next = env.get_laser_observation()
            obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            state_next_list = comm.gather(state_next, root=0)

            ## training
            # -------------------------------------------------------------------------
            if env.index == 0:
                # store rank 0's transition together with the gathered reward/terminal lists
                replay_memory.push(state[0], state[1], state[2], a, r_list,
                                   state_next[0], state_next[1], state_next[2], terminal_list)

                policy_list = [actor, actor_target, critic, critic_target]
                optimizer_list = [actor_opt, critic_opt]

                if len(replay_memory) > BATCH_SIZE:
                    ## update step by step
                    # -------------------------------------------------------------------------
                    ddpg_update_stage(policy=policy_list, optimizer=optimizer_list,
                                      batch_size=BATCH_SIZE, memory=replay_memory, epoch=EPOCH,
                                      replay_size=REPLAY_SIZE, gamma=GAMMA, num_step=BATCH_SIZE,
                                      num_env=NUM_ENV, frames=LASER_HIST, obs_size=OBS_SIZE,
                                      act_size=ACT_SIZE, tau=TAU)
                    global_update += 1

            step += 1
            state = state_next

        ## save policy and log
        # -------------------------------------------------------------------------
        if env.index == 0:
            if global_update != 0 and global_update % 3000 == 0:
                torch.save(actor.state_dict(), policy_path + '/actor_{}'.format(global_update))
                torch.save(critic.state_dict(), policy_path + '/critic_{}'.format(global_update))
                logger.info('########################## model saved when update {} times'
                            '#########################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0]) ** 2 +
                           (env.goal_point[1] - env.init_pose[1]) ** 2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
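# The OUNoise class used by the DDPG loop above is referenced but not defined in this
# file. The sketch below (named OUNoiseSketch to avoid shadowing the real class) is a
# minimal Ornstein-Uhlenbeck exploration noise whose interface matches the calls made
# above: OUNoise(action_dim=..., action_bound=...), reset(), get_action(mean, step).
# The mu/theta/sigma values and the linear sigma decay are assumptions, not the
# project's actual settings.
import numpy as np

class OUNoiseSketch:
    def __init__(self, action_dim, action_bound, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.05, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_dim
        self.low = np.asarray(action_bound[0])   # per-dimension lower bounds
        self.high = np.asarray(action_bound[1])  # per-dimension upper bounds
        self.reset()

    def reset(self):
        # restart the OU process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

    def get_action(self, action, t=0):
        # add temporally correlated noise (sigma decays linearly with t) and clip the
        # result into the action bounds; broadcasts over a batch of mean actions
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, float(t) / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)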
def run(comm, env, agent, policy_path, args):
    # training loop
    test_interval = 10
    save_interval = 100

    total_numsteps = 0
    updates = 0

    # world reset
    if env.index == 0:
        env.reset_gazebo_simulation()

    # TensorBoard
    writer = SummaryWriter('test_runs/' + policy_path)

    # replay memory
    memory = ReplayMemory(args.replay_size, args.seed)

    for i_episode in range(args.num_steps):
        episode_reward = 0
        episode_steps = 0
        done = False

        # env reset
        env.set_gazebo_pose()
        # generate goal
        env.generate_goal_point_gazebo()

        # get initial state
        frame = env.get_laser_observation()
        frame_stack = deque([frame, frame, frame])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [frame_stack, goal, speed]

        # episode start
        while not done and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # sample actions for all agents at rank == 0
            if env.index == 0:
                action = agent.select_action(state_list)
            else:
                action = None

            if env.index == 0:
                if len(memory) > args.batch_size:
                    # number of updates per step in the environment
                    for i in range(args.updates_per_step):
                        # update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = \
                            agent.update_parameters(memory, args.batch_size, updates)

                        writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                        writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                        writer.add_scalar('loss/policy', policy_loss, updates)
                        writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                        writer.add_scalar('entropy_temperature/alpha', alpha, updates)
                        updates += 1

            ## execute actions
            # -------------------------------------------------------------------------
            # per-dimension action bounds: a_min = [0, -1], a_max = [1, 1]
            action_clip_bound = [[0, -1], [1, 1]]
            clipped_action = np.clip(action, a_min=action_clip_bound[0], a_max=action_clip_bound[1])
            real_action = comm.scatter(clipped_action, root=0)

            # if real_action[0] >= 0.02 and real_action[0] <= 0.2:
            #     real_action[0] = real_action[0] / 0.6
            # enforce a minimum forward speed and a dead band on the second action component
            if real_action[0] < 0.10 and real_action[0] > 0:
                real_action[0] = 0.1
            if real_action[1] > 0 and real_action[1] < 0.15:
                real_action[1] = 0.0
            elif real_action[1] < 0 and real_action[1] > -0.15:
                real_action[1] = 0.0

            env.control_vel(real_action)
            # rospy.sleep(0.001)

            ## get reward and terminal state
            reward, done, result = env.get_reward_and_terminate(episode_steps)
            # print("Action : [{}, {}], Distance : {}, Reward : {}".format(real_action[0], real_action[1], env.distance, reward))
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            ## get next state
            next_frame = env.get_laser_observation()
            frame_stack.popleft()
            frame_stack.append(next_frame)
            next_goal = np.asarray(env.get_local_goal())
            next_speed = np.asarray(env.get_self_speed())
            next_state = [frame_stack, next_goal, next_speed]

            r_list = comm.gather(reward, root=0)
            done_list = comm.gather(done, root=0)
            next_state_list = comm.gather(next_state, root=0)

            if env.index == 0:
                # memory.list_push(state_list, action, r_list, next_state_list, done_list)
                # append every agent's transition to memory
                for i in range(np.asarray(state_list).shape[0]):
                    memory.push(state_list[i][0], state_list[i][1], state_list[i][2], action[i],
                                r_list[i], next_state_list[i][0], next_state_list[i][1],
                                next_state_list[i][2], done_list[i])

            state = next_state

            # if total_numsteps > args.num_steps:
            #     break

        if env.index == 0:
            writer.add_scalar('reward/train', episode_reward, i_episode)

            # if global_update != 0 and global_update % 20 == 0:
            if i_episode != 0 and i_episode % save_interval == 0:
                torch.save(agent.policy.state_dict(), policy_path + '/policy_epi_{}'.format(i_episode))
                print('########################## policy model saved at episode {} ####################'.format(i_episode))
                torch.save(agent.critic_1.state_dict(), policy_path + '/critic_1_epi_{}'.format(i_episode))
                print('########################## critic_1 model saved at episode {} ####################'.format(i_episode))
                torch.save(agent.critic_2.state_dict(), policy_path + '/critic_2_epi_{}'.format(i_episode))
                print('########################## critic_2 model saved at episode {} ####################'.format(i_episode))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0]) ** 2 +
                           (env.goal_point[1] - env.init_pose[1]) ** 2)

        print("Env: {}, memory_size: {}, Goal: ({}, {}), Episode: {}, steps: {}, Reward: {}, Distance: {}, {}".format(
            env.index, len(memory), round(env.goal_point[0], 2), round(env.goal_point[1], 2),
            i_episode + 1, episode_steps, round(episode_reward, 2), round(distance, 2), result))
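# All three training loops assume a ReplayMemory with the interface
# ReplayMemory(capacity, seed), push(*transition) and len(memory). The sketch below
# (named ReplayMemorySketch to avoid shadowing the real class) is one minimal
# ring-buffer implementation compatible with those calls; the sample() method and the
# flat tuple layout of each transition are assumptions about how the update functions
# consume the buffer, not the project's actual code.
import random

class ReplayMemorySketch:
    def __init__(self, capacity, seed):
        random.seed(seed)
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # next slot to overwrite once the buffer is full

    def push(self, *transition):
        # store one transition, e.g. (frame_stack, goal, speed, action, reward,
        # next_frame_stack, next_goal, next_speed, done), overwriting the oldest
        # entry once capacity is reached
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # uniformly sample a batch and regroup it column-wise so each field
        # (states, actions, rewards, ...) comes out as its own tuple
        batch = random.sample(self.buffer, batch_size)
        return tuple(zip(*batch))

    def __len__(self):
        return len(self.buffer)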