# set vars sess.run(op_set_lr, feed_dict={lr_in: learning_rate}) print "Using learning rate {}".format(sess.run(lr)) n_ep = 0 n_total_steps = start_step # GoGoGo while n_total_steps <= 2.5e5: cum_reward = 0.0 n_ep_steps = 0 state = env.reset() while True: action = agent.act(state) if action != 3: print_qvals(n_ep_steps, __agent, state, action, AGENT_ACTIONS) next_state, vec_reward, done, env_info = env.step(action) reward, done, reward_info = reward_vector2scalar( action, vec_reward, done, agent.n_skip, agent.cnt_skip) agent_info = agent.step(sess=sess, state=state, action=action, reward=reward, next_state=next_state, episode_done=done) env_info.update(reward_info) summary_proto = log_info( agent_info, env_info, done, cum_reward, n_ep,
next_action = action # cnt_skip = 1 if next_action == 0 else n_skip cnt_skip = int( n_skip * (1 + np.random.rand()) ) # randome start offset to enforce randomness on phase log_info(update_info) while True: n_steps += 1 cnt_skip -= 1 update_info = {} t_learn, t_infer, t_step = 0, 0, 0 # Env step t = time.time() next_state, rewards, done, info = env.step(skip_action) flag_success = done t_step = time.time() - t state, action, reward, next_state, done = \ func_compile_exp_agent(state, action, rewards, next_state, done) flag_tail = done flag_success = True if flag_success and reward > 0.0 else False skip_reward += reward if cnt_skip == 0 or done: # average rewards during skipping skip_reward /= (n_skip - cnt_skip) # add tail for non-early-stops skip_reward += flag_tail * gamma * skip_reward / (1 - gamma) update_info = agent.step(sess=sess,
def exp(dir_prefix, tf_log_dir="ckpt", our_log_dir="logging", replay_cache_dir="ReplayBufferCache", gpu_mem_fraction=0.15, save_checkpoint_secs=3600): n_skip = 6 n_stack = 3 if_random_phase = True # === Agent # --- agent basic ALL_ACTIONS = [(ord(mode),) for mode in ['s', 'd', 'a']] + [(0,)] AGENT_ACTIONS = ALL_ACTIONS[:3] num_actions = len(AGENT_ACTIONS) noop = 3 gamma = 0.9 ckpt_step = 0 greedy_epsilon = CappedLinear(int(3e4)-ckpt_step, 0.2-(0.15/3e4*ckpt_step), 0.05) start_step = ckpt_step*6 # --- replay buffer replay_bucket_size = 100 replay_max_sample_epoch = 2 # --- NN architecture f_net = lambda inputs: f_dueling_q(inputs, num_actions) if_ddqn = True # --- optimization batch_size = 8 learning_rate = 1e-4 target_sync_interval = 1 target_sync_rate = 1e-3 update_interval = 1 max_grad_norm = 1.0 sample_mimimum_count = 100 update_ratio = 8.0 # --- logging and ckpt replay_capacity = 300 replay_ratio_active = 1.0 # === Reward function class FuncReward(object): def __init__(self, gamma): self.__gamma = gamma self._ema_speed = 10.0 self._ema_dist = 0.0 self._obs_risk = 0.0 self._road_change = False self._mom_opp = 0.0 self._mom_biking = 0.0 self._steering = False self._waiting_steps = 0 def reset(self): self._ema_speed = 10.0 self._ema_dist = 0.0 self._obs_risk = 0.0 self._road_change = False self._mom_opp = 0.0 self._mom_biking = 0.0 self._steering = False def _func_scalar_reward(self, rewards, action): """Coverts a vector reward into a scalar.""" info = {} # append a reward that is 1 when action is lane switching rewards = rewards.tolist() print (' '*3 + 'R: [' + '{:4.2f} ' * len(rewards) + ']').format( *rewards), # extract relevant rewards. speed = rewards[0] dist = rewards[1] obs_risk = rewards[2] # road_invalid = rewards[3] > 0.01 # any yellow or red road_change = rewards[4] > 0.01 # entering intersection opp = rewards[5] biking = rewards[6] # inner = rewards[7] # outter = rewards[8] steer = np.logical_or(action == 1, action == 2) if speed < 0.1: self._waiting_steps += 1 else: self._waiting_steps = 0 # update reward-related state vars ema_speed = 0.5 * self._ema_speed + 0.5 * speed ema_dist = 1.0 if dist > 2.0 else 0.9 * self._ema_dist mom_opp = min((opp < 0.5) * (self._mom_opp + 1), 20) mom_biking = min((biking > 0.5) * (self._mom_biking + 1), 12) steering = steer if action != 3 else self._steering self._ema_speed = ema_speed self._ema_dist = ema_dist self._obs_risk = obs_risk self._road_change = road_change self._mom_opp = mom_opp self._mom_biking = mom_biking self._steering = steering print '{:3.0f}, {:3.0f}, {:4.2f}, {:3.0f}'.format( mom_opp, mom_biking, ema_dist, self._steering), info['reward_fun/speed'] = speed info['reward_fun/dist2longest'] = dist info['reward_fun/obs_risk'] = obs_risk info['reward_fun/road_change'] = road_change info['reward_fun/on_opposite'] = opp info['reward_fun/on_biking'] = biking info['reward_fun/steer'] = steer info['reward_fun/mom_opposite'] = mom_opp info['reward_fun/mom_biking'] = mom_biking info['waiting_steps'] = self._waiting_steps # calculate scalar reward reward = [ # velocity speed * 10 - 10, # obs factor -100.0 * obs_risk, # opposite -20 * (0.9 + 0.1 * mom_opp) * (mom_opp > 1.0), # ped -40 * (0.9 + 0.1 * mom_biking) * (mom_biking > 1.0), # steer steering * -40.0, # distance to longest -20.0 * (dist > 3.75/2) ] reward = np.sum(reward) / 100.0 print ': {:5.2f}'.format(reward) return reward, info def _func_early_stopping(self): """Several early stopping criterion.""" info = {} done = False # switched lane while going into intersection. if self._road_change and self._ema_dist > 0.2: print "[Episode early stopping] turned into intersection." done = True info['banned_road_change'] = True # used biking lane to cross intersection if self._road_change and self._mom_biking > 0: print "[Episode early stopping] entered intersection on biking lane." done = True info['banned_road_change'] = True # hit obstacle if self._obs_risk > 1.0: print "[Episode early stopping] hit obstacle." done = True # waiting too long if self._waiting_steps > 80: print "[Episode early stopping] waiting too long" done = True return done, info def _func_skipping_bias(self, reward, done, info, n_skip, cnt_skip): new_info = {} if 'banned_road_change' in info: reward -= 1.0 * (n_skip - cnt_skip) if done: pass new_info['reward_fun/reward'] = reward return reward, new_info def __call__(self, action, rewards, done, n_skip=1, cnt_skip=0): info = {} reward, info_diff = self._func_scalar_reward(rewards, action) info.update(info_diff) early_done, info_diff = self._func_early_stopping() done = done | early_done info.update(info_diff) reward, info_diff = self._func_skipping_bias( reward, done, info, n_skip, cnt_skip) info.update(info_diff) if done: info['flag_success'] = reward > 0.0 self.reset() return reward, done, info # ========================================== # ========================================== # ========================================== env, replay_buffer, _agent = None, None, None try: # Parse flags # FLAGS = tf.app.flags.FLAGS tf_log_dir = os.sep.join([dir_prefix, tf_log_dir]) our_log_dir = os.sep.join([dir_prefix, our_log_dir]) replay_cache_dir = os.sep.join([dir_prefix, replay_cache_dir]) # Modify tf graph graph = tf.get_default_graph() # -- create learning rate var and optimizer lr = tf.get_variable( 'learning_rate', [], dtype=tf.float32, initializer=tf.constant_initializer(1e-3), trainable=False ) lr_in = tf.placeholder(dtype=tf.float32) op_set_lr = tf.assign(lr, lr_in) optimizer_td = tf.train.AdamOptimizer(learning_rate=lr) # -- create global step variable global_step = tf.get_variable( 'global_step', [], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False) def gen_default_backend_cmds(): ws_path = '/Projects/catkin_ws/' initialD_path = '/Projects/hobotrl/playground/initialD/' backend_path = initialD_path + 'ros_environments/backend_scripts/' utils_path = initialD_path + 'ros_environments/backend_scripts/utils/' backend_cmds = [ ['python', utils_path + '/iterate_test_case.py'], # Parse maps ['python', utils_path + 'parse_map.py', ws_path + 'src/Map/src/map_api/data/honda_wider.xodr', utils_path + 'road_segment_info.txt'], # Start roscore ['roscore'], # Reward function script ['python', backend_path + 'gazebo_rl_reward.py'], # Road validity node script ['python', backend_path + 'road_validity.py', utils_path + 'road_segment_info.txt.signal'], # Simulation restarter backend ['python', backend_path+'rviz_restart.py', 'next.launch'], ] return backend_cmds # Environment env = FrameStack(DrSimDecisionK8S(backend_cmds=gen_default_backend_cmds()), n_stack) # Agent replay_buffer = BigPlayback( bucket_cls=MapPlayback, cache_path=replay_cache_dir, capacity=replay_capacity, bucket_size=replay_bucket_size, ratio_active=replay_ratio_active, max_sample_epoch=replay_max_sample_epoch, ) state_shape = env.observation_space.shape __agent = DQN( f_create_q=f_net, state_shape=state_shape, # OneStepTD arguments num_actions=num_actions, discount_factor=gamma, ddqn=if_ddqn, # target network sync arguments target_sync_interval=target_sync_interval, target_sync_rate=target_sync_rate, # epsilon greedy arguments greedy_epsilon=greedy_epsilon, # optimizer arguments network_optimizer=LocalOptimizer(optimizer_td, max_grad_norm), # sampler arguments sampler=TransitionSampler( replay_buffer, batch_size=batch_size, interval=update_interval, minimum_count=sample_mimimum_count), # checkpoint global_step=global_step ) # Utilities stepsSaver = StepsSaver(our_log_dir) reward_vector2scalar = FuncReward(gamma) # Configure sess config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_fraction with __agent.create_session( config=config, save_dir=tf_log_dir, save_checkpoint_secs=save_checkpoint_secs) as sess, \ AsynchronousAgent( agent=__agent, method='ratio', ratio=update_ratio) as _agent: agent = SkippingAgent( # n_skip_vec=(2, 6, 6), agent=_agent, n_skip=n_skip, specific_act=noop ) summary_writer = SummaryWriterCache.get(tf_log_dir) # set vars sess.run(op_set_lr, feed_dict={lr_in: learning_rate}) print "Using learning rate {}".format(sess.run(lr)) n_ep = 0 n_total_steps = start_step # GoGoGo while n_total_steps <= 2.5e5: cum_reward = 0.0 n_ep_steps = 0 state = env.reset() while True: action = agent.act(state, exploration=False) if action != 3: print_qvals( n_ep_steps, __agent, state, action, AGENT_ACTIONS ) next_state, vec_reward, done, env_info = env.step(action) reward, done, reward_info = reward_vector2scalar( action, vec_reward, done, agent.n_skip, agent.cnt_skip ) agent_info = agent.step( sess=sess, state=state, action=action, reward=reward, next_state=next_state, episode_done=done, learning_off=True ) env_info.update(reward_info) summary_proto = log_info( agent_info, env_info, done, cum_reward, n_ep, n_ep_steps, n_total_steps, ) summary_writer.add_summary(summary_proto, n_total_steps) n_total_steps += 1 n_ep_steps += 1 cum_reward += reward flag_success = reward_info['flag_success'] \ if 'flag_success' in reward_info else False stepsSaver.save( n_ep, n_total_steps, state, action, vec_reward, reward, done, cum_reward, flag_success ) state = next_state if done: n_ep += 1 logging.warning( "Episode {} finished in {} steps, reward is {}.".format( n_ep, n_ep_steps, cum_reward, ) ) break if n_ep >= 100: break except Exception as e: print e.message traceback.print_exc() finally: logging.warning("="*30) logging.warning("="*30) logging.warning("Tidying up...") # kill orphaned monitor daemon process if env is not None: env.env.exit() replay_buffer.close() if replay_buffer is not None: replay_buffer.close() if _agent is not None: _agent.stop() # os.killpg(os.getpgid(os.getpid()), signal.SIGKILL) import time logging.warning("waiting for k8s end") time.sleep(180) logging.warning("="*30)