def __init__(self, agent, env, config: Config):
    self.agent = agent
    self.env = env
    self.config = config

    self.outputdir = get_output_folder(self.config.output, self.config.env)
    self.agent.save_config(self.outputdir)
    self.board_logger = TensorBoardLogger(self.outputdir)
    print(self.env.action_space.low, self.env.action_space.high)
def __init__(self, agent, env, config: Config):
    self.agent = agent
    self.env = env
    self.config = config

    # Linear epsilon decay: interpolate from epsilon_start to epsilon_final over
    # the first eps_fraction * frames frames, then hold at epsilon_final.
    epsilon_final = self.config.epsilon_min
    epsilon_start = self.config.epsilon
    eps_steps = self.config.eps_fraction * float(self.config.frames)
    self.epsilon_by_frame = lambda frame_idx: epsilon_start + min(
        1.0, float(frame_idx) / eps_steps) * (epsilon_final - epsilon_start)

    self.outputdir = get_output_folder(self.config.output, self.config.env)
    self.agent.save_config(self.outputdir)
    self.board_logger = TensorBoardLogger(self.outputdir)
def __init__(self, agent, env, config: Config):
    self.agent = agent
    self.env = env
    self.config = config

    # Non-linear (exponential) epsilon decay: epsilon relaxes towards epsilon_final
    # with time constant eps_decay frames.
    epsilon_final = self.config.epsilon_min
    epsilon_start = self.config.epsilon
    epsilon_decay = self.config.eps_decay
    self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
        epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    self.outputdir = get_output_folder(self.config.output, self.config.env)
    self.agent.save_config(self.outputdir)
    self.board_logger = TensorBoardLogger(self.outputdir)
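# --- Illustrative comparison of the two epsilon schedules above (linear vs. exponential).
# --- This is a standalone sketch; the concrete numbers are assumptions, not values taken
# --- from any Config file in this repository.
import math

epsilon_start, epsilon_final = 1.0, 0.01
frames, eps_fraction, eps_decay = 100000, 0.3, 30000

eps_steps = eps_fraction * float(frames)
linear = lambda fr: epsilon_start + min(1.0, float(fr) / eps_steps) * (epsilon_final - epsilon_start)
exponential = lambda fr: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * fr / eps_decay)

for fr in (0, 10000, 30000, 60000, 100000):
    # The linear schedule reaches epsilon_final exactly at eps_steps frames;
    # the exponential schedule only approaches it asymptotically.
    print("frame %6d: linear %.3f, exponential %.3f" % (fr, linear(fr), exponential(fr)))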
def __init__(self, agent, config: Config, record=False):
    self.agent = agent
    self.config = config
    self.outputdir = get_output_folder()

    # if record:
    #     os.makedirs('video', exist_ok=True)
    #     filepath = self.outputdir + '/video/' + config.env + '-' + time_seq()
    #     env = wrappers.Monitor(env, filepath,
    #                            video_callable=lambda episode_id: episode_id % self.config.record_ep_interval == 0)
    #     self.env = env
    #     self.env.seed(config.seed)

    self.agent.is_training = True
    self.agent.save_config(self.outputdir)
    self.board_logger = TensorBoardLogger(self.outputdir)
class Trainer:
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        # Non-linear (exponential) epsilon decay
        epsilon_final = self.config.epsilon_min
        epsilon_start = self.config.epsilon
        epsilon_decay = self.config.eps_decay
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        # print("outputdir:", self.outputdir)
        # input()
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)

    def train(self, pre_fr=0):
        losses = []
        all_rewards = []
        episode_reward = 0
        ep_num = 0
        is_win = False

        state = self.env.reset()
        for fr in range(pre_fr + 1, self.config.frames + 1):
            epsilon = self.epsilon_by_frame(fr)
            action = self.agent.act(state, epsilon)

            next_state, reward, done, _ = self.env.step(action)
            self.agent.buffer.add(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            loss = 0
            if self.agent.buffer.size() > self.config.batch_size:
                loss = self.agent.learning(fr)
                losses.append(loss)
                self.board_logger.scalar_summary('Loss per frame', fr, loss)

            if fr % self.config.print_interval == 0:
                print("frames: %5d, reward: %5f, loss: %4f, episode: %4d" %
                      (fr, np.mean(all_rewards[-10:]), loss, ep_num))

            # Guard against logging before the first episode has finished.
            if fr % self.config.log_interval == 0 and all_rewards:
                self.board_logger.scalar_summary('Reward per episode', ep_num, all_rewards[-1])

            if self.config.checkpoint and fr % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(fr, self.outputdir)

            if done:
                state = self.env.reset()
                all_rewards.append(episode_reward)
                episode_reward = 0
                ep_num += 1
                avg_reward = float(np.mean(all_rewards[-100:]))
                self.board_logger.scalar_summary('Best 100-episodes average reward', ep_num, avg_reward)

                if (len(all_rewards) >= 100 and avg_reward >= self.config.win_reward
                        and all_rewards[-1] > self.config.win_reward):
                    is_win = True
                    self.agent.save_model(self.outputdir, 'best')
                    print('Ran %d episodes best 100-episodes average reward is %3f. Solved after %d trials ✔' %
                          (ep_num, avg_reward, ep_num - 100))
                    if self.config.win_break:
                        break

        if not is_win:
            print('Did not solve after %d episodes' % ep_num)
            self.agent.save_model(self.outputdir, 'last')
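# --- Minimal usage sketch for the DQN-style Trainer above. The Config fields shown are the
# --- ones the class actually reads; the environment constructor and DQNAgent class are
# --- hypothetical placeholders, not this repository's exact API, so the whole sketch is
# --- left as a comment rather than presented as runnable project code.
#
#   config = Config()
#   config.env = 'CartPole-v0'
#   config.frames = 200000
#   config.epsilon, config.epsilon_min, config.eps_decay = 1.0, 0.01, 30000
#   config.batch_size = 32
#   config.print_interval, config.log_interval = 5000, 5000
#   config.checkpoint, config.checkpoint_interval = True, 50000
#   config.win_reward, config.win_break = 195, True
#
#   env = gym.make(config.env)            # assumes a Gym-style environment
#   agent = DQNAgent(config)              # hypothetical agent exposing act / learning / buffer
#   trainer = Trainer(agent, env, config)
#   trainer.train()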
class Trainer:
    def __init__(self, agent, config: Config, record=False):
        self.agent = agent
        self.config = config
        self.outputdir = get_output_folder()

        # if record:
        #     os.makedirs('video', exist_ok=True)
        #     filepath = self.outputdir + '/video/' + config.env + '-' + time_seq()
        #     env = wrappers.Monitor(env, filepath,
        #                            video_callable=lambda episode_id: episode_id % self.config.record_ep_interval == 0)
        #     self.env = env
        #     self.env.seed(config.seed)

        self.agent.is_training = True
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)

    async def train(self, pre_episodes=0, pre_total_step=0):
        total_step = pre_total_step
        all_rewards = []

        result_dir = os.path.join('./logs/', util.now_str())
        os.makedirs(result_dir, exist_ok=True)
        header = ["num_episode", "total_reward", "episode_length"]
        recorder = util.RecordHistory(os.path.join(result_dir, "history.csv"), header)
        recorder.generate_csv()

        for ep in range(pre_episodes + 1, self.config.episodes + 1):
            await util.sendCommand(util.COMMAND_MAP[util.Commands.RESET.value])
            s0 = await util.getState()
            # s0 = self.env.reset()
            # self.agent.reset()
            done = False
            step = 0
            actor_loss, critics_loss, reward = 0, 0, 0
            done_count = 0

            # decay noise
            self.agent.decay_epsilon()

            while done_count < 100:
                action = self.agent.get_action(s0)

                # translate action to motor speed here
                lms = int(action[0] * 127)
                rms = int(action[1] * 127)
                s1, r1, done, _ = await util.getNextState(lms, rms)
                # s1, r1, done = self.env.step(action)
                if done:
                    done_count += 1

                self.agent.buffer.add(s0, action, r1, done, s1)
                s0 = s1

                if self.agent.buffer.size() > self.config.batch_size:
                    loss_a, loss_c = self.agent.learning()
                    actor_loss += loss_a
                    critics_loss += loss_c

                reward += r1
                step += 1
                total_step += 1

                if step + 1 > self.config.max_steps:
                    break

            all_rewards.append(reward)
            avg_reward = float(np.mean(all_rewards[-100:]))
            self.board_logger.scalar_summary('Reward per episode', ep, all_rewards[-1])
            self.board_logger.scalar_summary('Best 100-episodes average reward', ep, avg_reward)

            print('total step: %5d, episodes %3d, episode_step: %5d, episode_reward: %5f' %
                  (total_step, ep, step, reward))

            history = {
                "num_episode": ep,
                "total_reward": reward,
                "episode_length": step,
            }
            recorder.add_histry(history)

            # checkpoint
            if self.config.checkpoint and ep % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(ep, total_step, self.outputdir)

        # save model at last
        self.agent.save_model(self.outputdir)
        asyncio.get_event_loop().stop()
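# --- Hypothetical launch pattern for the asynchronous trainer above. Since train() ends by
# --- calling asyncio.get_event_loop().stop(), it is presumably scheduled on a long-running
# --- event loop roughly like this; agent/config construction is omitted and the exact entry
# --- point used by the project is an assumption.
#
#   import asyncio
#
#   trainer = Trainer(agent, config)
#   loop = asyncio.get_event_loop()
#   loop.create_task(trainer.train())
#   loop.run_forever()        # returns once train() stops the loop after the last episode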
class Trainer:
    def __init__(self, agent, env, config: Config, record=False):
        self.agent = agent
        self.config = config
        self.outputdir = get_output_folder(self.config.output, self.config.env)

        if record:
            os.makedirs('video', exist_ok=True)
            filepath = self.outputdir + '/video/' + config.env + '-' + time_seq()
            env = wrappers.Monitor(env, filepath,
                                   video_callable=lambda episode_id: episode_id % self.config.record_ep_interval == 0)

        self.env = env
        self.env.seed(config.seed)

        self.agent.is_training = True
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)

    def train(self, pre_episodes=0, pre_total_step=0):
        total_step = pre_total_step
        all_rewards = []

        for ep in range(pre_episodes + 1, self.config.episodes + 1):
            s0 = self.env.reset()
            self.agent.reset()

            done = False
            step = 0
            actor_loss, critics_loss, reward = 0, 0, 0

            # decay noise
            self.agent.decay_epsilon()

            while not done:
                action = self.agent.get_action(s0)

                s1, r1, done, info = self.env.step(action)
                self.agent.buffer.add(s0, action, r1, done, s1)
                s0 = s1

                if self.agent.buffer.size() > self.config.batch_size:
                    loss_a, loss_c = self.agent.learning()
                    actor_loss += loss_a
                    critics_loss += loss_c

                reward += r1
                step += 1
                total_step += 1

                if step + 1 > self.config.max_steps:
                    break

            all_rewards.append(reward)
            avg_reward = float(np.mean(all_rewards[-100:]))
            self.board_logger.scalar_summary('Reward per episode', ep, all_rewards[-1])
            self.board_logger.scalar_summary('Best 100-episodes average reward', ep, avg_reward)

            print('total step: %5d, episodes %3d, episode_step: %5d, episode_reward: %5f' %
                  (total_step, ep, step, reward))

            # checkpoint
            if self.config.checkpoint and ep % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(ep, total_step, self.outputdir)

        # save model at last
        self.agent.save_model(self.outputdir)
class Trainer:
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        # Non-linear (exponential) epsilon decay
        epsilon_final = self.config.epsilon_min
        epsilon_start = self.config.epsilon
        epsilon_decay = self.config.eps_decay
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)
        print(self.outputdir)

    def train(self, pre_fr=0):
        losses = []
        all_rewards = []
        episode_reward = 0
        ep_num = 0
        is_win = False
        start = time.time()
        frames = []  # frames buffered for GIF recording

        state = self.env.reset()
        for fr in range(pre_fr + 1, self.config.frames + 1):
            # Record the first 200 frames of every gif_interval window as a GIF.
            if 1 <= fr % self.config.gif_interval <= 200:
                if fr % self.config.gif_interval == 1:
                    frames = []
                img = state[0, 0:3].transpose(1, 2, 0).astype('uint8')
                frames.append(Image.fromarray(img).convert('RGB'))
                if fr % self.config.gif_interval == 200:
                    imageio.mimsave('record.gif', frames, 'GIF', duration=0.1)

            epsilon = self.epsilon_by_frame(fr)
            action = self.agent.act(state, epsilon)

            next_state, reward, done, _ = self.env.step(action)
            self.agent.buffer.add(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            loss = 0
            if fr > self.config.init_buff and fr % self.config.learning_interval == 0:
                loss = self.agent.learning(fr)
                losses.append(loss)
                self.board_logger.scalar_summary('Loss per frame', fr, loss)

            if fr % self.config.print_interval == 0:
                print("TIME {} num timesteps {}, FPS {} \n Loss {:.3f}, average reward {:.1f}".format(
                    time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start)),
                    fr, int(fr / (time.time() - start)),
                    loss, np.mean(all_rewards[-10:])))

            # Guard against logging before the first episode has finished.
            if fr % self.config.log_interval == 0 and all_rewards:
                self.board_logger.scalar_summary('Reward per episode', ep_num, all_rewards[-1])

            if self.config.checkpoint and fr % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(fr, self.outputdir)

            if done:
                state = self.env.reset()
                all_rewards.append(episode_reward)
                episode_reward = 0
                ep_num += 1
                avg_reward = float(np.mean(all_rewards[-100:]))
                self.board_logger.scalar_summary('Best 100-episodes average reward', ep_num, avg_reward)

                if (len(all_rewards) >= 100 and avg_reward >= self.config.win_reward
                        and all_rewards[-1] > self.config.win_reward):
                    is_win = True
                    self.agent.save_model(self.outputdir, 'best')
                    print('Ran %d episodes best 100-episodes average reward is %3f. Solved after %d trials ✔' %
                          (ep_num, avg_reward, ep_num - 100))
                    if self.config.win_break:
                        break

        if not is_win:
            print('Did not solve after %d episodes' % ep_num)
            self.agent.save_model(self.outputdir, 'last')
class Trainer:
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)
        print(self.env.action_space.low, self.env.action_space.high)

    def train(self, pre_fr=0):
        t = 0
        all_rewards = []
        tmp_reward = 0
        episode_reward = 0
        ep_num = 0
        is_win = False

        state = self.env.reset()
        for fr in range(pre_fr + 1, self.config.frames + 1):
            t += 1
            # self.env.render()

            # Additive Gaussian exploration noise, clipped to the action bounds.
            action = self.agent.act(state)
            action = action + np.random.normal(0, self.config.exploration_noise,
                                               size=self.env.action_space.shape[0])
            action = action.clip(self.env.action_space.low, self.env.action_space.high)

            next_state, reward, done, _ = self.env.step(action)
            self.agent.buffer.add(state, action, reward, next_state, float(done))

            state = next_state
            episode_reward += reward

            if fr % self.config.print_interval == 0:
                print("frames: %5d, reward: %5f, episode: %4d" %
                      (fr, np.mean(all_rewards[-10:]), ep_num))

            # Guard against logging before the first episode has finished.
            if fr % self.config.log_interval == 0 and all_rewards:
                self.board_logger.scalar_summary('Reward per episode', ep_num, all_rewards[-1])

            if self.config.checkpoint and fr % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(fr, self.outputdir)

            if done or t == self.config.max_timesteps:
                # Update the networks once per episode, using the episode length t.
                self.agent.learning(fr, t)
                t = 0

                state = self.env.reset()
                all_rewards.append(episode_reward)
                episode_reward = 0
                ep_num += 1
                avg_reward = float(np.mean(all_rewards[-100:]))
                self.board_logger.scalar_summary('Best 100-episodes average reward', ep_num, avg_reward)

                if (len(all_rewards) >= 100 and avg_reward >= self.config.win_reward
                        and all_rewards[-1] > self.config.win_reward):
                    is_win = True
                    self.agent.save_model(self.outputdir, 'best')
                    print('Ran %d episodes best 100-episodes average reward is %3f. Solved after %d trials ✔' %
                          (ep_num, avg_reward, ep_num - 100))
                    if self.config.win_break:
                        break
                elif len(all_rewards) >= 100 and avg_reward > tmp_reward:
                    tmp_reward = avg_reward
                    self.agent.save_model(self.outputdir, 'tmp')
                    print('Ran %d episodes tmp 100-episodes average reward is %3f. tmp Solved after %d trials' %
                          (ep_num, avg_reward, ep_num - 100))

        if not is_win:
            print('Did not solve after %d episodes' % ep_num)
            self.agent.save_model(self.outputdir, 'last')
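# --- Standalone illustration of the exploration step used in the trainer above: the
# --- deterministic policy output is perturbed with Gaussian noise and clipped back into
# --- the action space. All concrete values here (bounds, noise scale, action) are made up.
import numpy as np

action_low, action_high = np.array([-2.0]), np.array([2.0])    # e.g. a 1-D continuous action space
exploration_noise = 0.1

action = np.array([1.95])                                       # deterministic policy output
action = action + np.random.normal(0, exploration_noise, size=action.shape[0])
action = action.clip(action_low, action_high)                   # keep the action inside its bounds
print(action)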
class Trainer:
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config
        self.SaveImage = True
        if not os.path.exists('./history'):
            os.mkdir('./history')

        # Non-linear (exponential) epsilon decay
        epsilon_final = self.config.epsilon_min
        epsilon_start = self.config.epsilon
        epsilon_decay = self.config.eps_decay
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)

    def train(self, pre_fr=0):
        losses = []
        all_rewards = []
        episode_reward = 0
        ep_num = 0
        is_win = False

        state = self.env.reset()  # (360, 480)
        current_, v, a = self.env.get_info()

        '''
        # RGB
        state = np.reshape([cv2.resize(state, (self.config.image_size, self.config.image_size)).transpose(2, 0, 1)],
                           (1, 3, self.config.image_size, self.config.image_size))  # (1, 3, 96, 96)
        history = np.stack((state, state, state, state, state, state), axis=1)  # (1, 3*6, 96, 96)
        history = np.reshape([np.concatenate(history)],
                             (1, 18, self.config.image_size, self.config.image_size))  # (1, 18, 96, 96)
        '''

        # Gray
        state = np.reshape([cv2.resize(state, (self.config.image_size, self.config.image_size))],
                           (1, self.config.image_size, self.config.image_size))  # (1, 96, 96)
        history = np.stack((state, state, state, state, state, state), axis=1)  # (1, 6, 96, 96)
        # Constant planes encoding speed (km/h) and acceleration magnitude.
        npv = np.ones([1, 1, self.config.image_size, self.config.image_size]) * int(
            3.6 * math.sqrt(v.x**2 + v.y**2 + v.z**2))  # (1, 1, 96, 96)
        npa = np.ones([1, 1, self.config.image_size, self.config.image_size]) * int(
            math.sqrt(a.x**2 + a.y**2 + a.z**2))  # (1, 1, 96, 96)
        history_value = np.append(history, npv, axis=1)  # (1, 7, 96, 96)
        history_value = np.append(history_value, npa, axis=1)  # (1, 8, 96, 96)

        if self.SaveImage:
            # Note: scipy.misc.imsave was removed in SciPy 1.2; imageio.imwrite is the usual replacement.
            img = history_value.transpose(0, 2, 3, 1)
            for ch in range(8):  # channels 6 and 7 are the speed (npv) and acceleration (npa) planes
                scipy.misc.imsave('history/history%d.jpg' % ch, img[0][:, :, ch])

        for fr in range(pre_fr + 1, self.config.frames + 1):
            # self.env.render()
            epsilon = self.epsilon_by_frame(fr)
            # action = self.agent.act(state, epsilon)
            action = self.agent.act(history_value, epsilon)
            next_state, reward, done, _ = self.env.step(action)
            next_, v, a = self.env.get_info()

            # Recalculate reward from distance travelled (ll), speed (vv) and acceleration (aa).
            ll = int(math.sqrt((next_.x - current_.x)**2 + (next_.y - current_.y)**2 + (next_.z - current_.z)**2))
            vv = int(3.6 * math.sqrt(v.x**2 + v.y**2 + v.z**2))
            aa = int(math.sqrt(a.x**2 + a.y**2 + a.z**2))
            if not done:
                reward += 2 * (ll - 2) + 3 if vv > 40 else -10

            '''
            # Draw path trajectory
            # self.env.draw_waypoint_union(self.env.world.debug, current_, next_)
            # self.env.draw_string(self.env.world.debug, current_, str('%15.0f km/h' % vv))
            '''

            next_history = np.reshape([cv2.resize(next_state, (self.config.image_size, self.config.image_size))],
                                      (1, 1, self.config.image_size, self.config.image_size))  # (1, 1, 96, 96)
            next_history = np.append(next_history, history[:, :5, :, :], axis=1)  # (1, 6, 96, 96)
            npv = np.ones([1, 1, self.config.image_size, self.config.image_size]) * vv  # (1, 1, 96, 96)
            npa = np.ones([1, 1, self.config.image_size, self.config.image_size]) * aa  # (1, 1, 96, 96)
            next_history_value = np.append(next_history, npv, axis=1)  # (1, 7, 96, 96)
            next_history_value = np.append(next_history_value, npa, axis=1)  # (1, 8, 96, 96)

            if self.SaveImage:
                img = next_history_value.transpose(0, 2, 3, 1)
                for ch in range(8):  # channels 6 and 7 are the speed (npv) and acceleration (npa) planes
                    scipy.misc.imsave('history/history%d%d.jpg' % (fr, ch), img[0][:, :, ch])

            # self.agent.buffer.add(state, action, reward, next_state, done)
            self.agent.buffer.add(history_value, action, reward, next_history_value, done)

            current_ = next_
            state = next_state
            history = next_history
            history_value = next_history_value
            episode_reward += reward

            loss = 0
            # if self.agent.buffer.size() > self.config.batch_size:
            if self.agent.buffer.size() > self.config.min_buff:
                loss = self.agent.learning(fr)
                losses.append(loss)
                self.board_logger.scalar_summary('Loss per frame', fr, loss)

            if fr % self.config.print_interval == 0:
                print("frames: %5d, reward: %5f, loss: %4f, episode: %4d, epsilon: %4f" %
                      (fr, np.mean(all_rewards[-10:]), loss, ep_num, self.epsilon_by_frame(fr)))

            # Guard against logging before the first episode has finished.
            if fr % self.config.log_interval == 0 and all_rewards:
                self.board_logger.scalar_summary('Reward per episode', ep_num, all_rewards[-1])

            if self.config.checkpoint and fr % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(fr, self.outputdir)

            if done:
                # Clean up CARLA actors before resetting the environment.
                for actor in self.env.actor_list:
                    actor.destroy()
                    # carla.command.DestroyActor(actor)
                self.env.vehicle.destroy()
                # print("All cleaned up!")

                state = self.env.reset()
                current_, v, a = self.env.get_info()

                # Gray
                state = np.reshape([cv2.resize(state, (self.config.image_size, self.config.image_size))],
                                   (1, self.config.image_size, self.config.image_size))  # (1, 96, 96)
                history = np.stack((state, state, state, state, state, state), axis=1)  # (1, 6, 96, 96)
                npv = np.ones([1, 1, self.config.image_size, self.config.image_size]) * int(
                    3.6 * math.sqrt(v.x**2 + v.y**2 + v.z**2))  # (1, 1, 96, 96)
                npa = np.ones([1, 1, self.config.image_size, self.config.image_size]) * int(
                    math.sqrt(a.x**2 + a.y**2 + a.z**2))  # (1, 1, 96, 96)
                history_value = np.append(history, npv, axis=1)  # (1, 7, 96, 96)
                history_value = np.append(history_value, npa, axis=1)  # (1, 8, 96, 96)

                all_rewards.append(episode_reward)
                episode_reward = 0
                ep_num += 1
                avg_reward = float(np.mean(all_rewards[-100:]))
                self.board_logger.scalar_summary('Best 100-episodes average reward', ep_num, avg_reward)

                if (len(all_rewards) >= 100 and avg_reward >= self.config.win_reward
                        and all_rewards[-1] > self.config.win_reward):
                    is_win = True
                    self.agent.save_model(self.outputdir, 'best')
                    print('Ran %d episodes best 100-episodes average reward is %3f. Solved after %d trials ✔' %
                          (ep_num, avg_reward, ep_num - 100))
                    if self.config.win_break:
                        break

        if not is_win:
            print('Did not solve after %d episodes' % ep_num)
            self.agent.save_model(self.outputdir, 'last')
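# --- Standalone sketch of the observation layout used by the CARLA trainer above: six
# --- stacked grayscale frames plus one constant speed plane and one constant acceleration
# --- plane, giving a (1, 8, image_size, image_size) tensor. The random frame and the
# --- speed/acceleration values below are placeholders, not data from the environment.
import numpy as np

image_size = 96
frame = np.random.randint(0, 256, (image_size, image_size))    # one grayscale camera frame
state = frame.reshape(1, image_size, image_size)                # (1, 96, 96)

history = np.stack((state,) * 6, axis=1)                        # (1, 6, 96, 96) frame stack
npv = np.ones([1, 1, image_size, image_size]) * 42              # constant speed plane, km/h
npa = np.ones([1, 1, image_size, image_size]) * 3               # constant acceleration plane

history_value = np.append(history, npv, axis=1)                 # (1, 7, 96, 96)
history_value = np.append(history_value, npa, axis=1)           # (1, 8, 96, 96)
print(history_value.shape)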