Example #1
def create_sample(args):
    # replay_size, max_traj_length, batch_size and update_interval are
    # module-level constants in the original source (see the later snippets).
    bucket_size = 8
    traj_count = replay_size / max_traj_length
    bucket_count = traj_count / bucket_size
    active_bucket = 4
    ratio = 1.0 * active_bucket / bucket_count
    transition_epoch = 8
    trajectory_epoch = transition_epoch * max_traj_length
    memory = BigPlayback(bucket_cls=Playback,
                         bucket_size=bucket_size,
                         max_sample_epoch=trajectory_epoch,
                         capacity=traj_count,
                         active_ratio=ratio,
                         cache_path=os.sep.join([
                             args.logdir, "cache",
                             str(args.index)
                         ]))
    sampler = sampling.TruncateTrajectorySampler2(
        memory,
        replay_size / max_traj_length,
        max_traj_length,
        batch_size=1,
        trajectory_length=batch_size,
        interval=update_interval)
    return sampler
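A minimal invocation sketch (not part of the original source): it assumes hobotrl's BigPlayback, Playback and sampling are importable, that the module-level constants read inside create_sample exist, and it fakes the args object with an argparse Namespace carrying the logdir and index attributes the function uses. The concrete values below are placeholders for illustration only.

from argparse import Namespace

replay_size = 10000        # assumed value, for illustration only
max_traj_length = 50
batch_size = 8
update_interval = 8

args = Namespace(logdir="./log/exp0", index=0)
sampler = create_sample(args)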
Example #2
lr = tf.get_variable('learning_rate', [],
                     dtype=tf.float32,
                     initializer=tf.constant_initializer(1e-3),
                     trainable=False)
lr_in = tf.placeholder(dtype=tf.float32)
op_set_lr = tf.assign(lr, lr_in)
optimizer_td = tf.train.AdamOptimizer(learning_rate=lr)

global_step = tf.get_variable('global_step', [],
                              dtype=tf.int32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)

replay_buffer = BigPlayback(bucket_cls=BalancedMapPlayback,
                            cache_path=replay_cache_dir,
                            capacity=replay_capacity,
                            bucket_size=replay_bucket_size,
                            ratio_active=replay_ratio_active,
                            max_sample_epoch=replay_max_sample_epoch,
                            num_actions=num_actions,
                            upsample_bias=replay_upsample_bias)

_agent = DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=num_actions,
    discount_factor=gamma,
    ddqn=if_ddqn,
    # target network sync arguments
    target_sync_interval=target_sync_interval,
    target_sync_rate=target_sync_rate,
    # epsilon greedy arguments
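The learning-rate plumbing above (a non-trainable variable, a feed placeholder, and an assign op) recurs in most of these examples; Example #4 below drives it with sess.run(op_set_lr, feed_dict={lr_in: learning_rate}). A self-contained TensorFlow 1.x sketch of just that pattern, separate from the snippets themselves:

import tensorflow as tf   # TensorFlow 1.x API

lr = tf.get_variable('learning_rate', [], dtype=tf.float32,
                     initializer=tf.constant_initializer(1e-3),
                     trainable=False)
lr_in = tf.placeholder(dtype=tf.float32)
op_set_lr = tf.assign(lr, lr_in)                    # overwrite lr with the fed value
optimizer_td = tf.train.AdamOptimizer(learning_rate=lr)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(op_set_lr, feed_dict={lr_in: 1e-4})    # change the rate at run time
    print(sess.run(lr))                             # -> ~1e-4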
Example #3
max_traj_length = 50
batch_size = 8
neighbour_size = 8
bucket_size = 20
update_interval = 8
traj_count = replay_size / max_traj_length
bucket_count = traj_count / bucket_size
active_bucket = 10
ratio = 1.0 * active_bucket / bucket_count
transition_epoch = 8
trajectory_epoch = transition_epoch * max_traj_length

memory = BigPlayback(
    bucket_cls=Playback,
    bucket_size=bucket_size,
    max_sample_epoch=trajectory_epoch,
    capacity=traj_count,
    active_ratio=ratio,
    cache_path="./ReplayBufferCache/experiment",
)
sampler = sampling.TruncateTrajectorySampler2(memory,
                                              replay_size / max_traj_length,
                                              max_traj_length, batch_size,
                                              neighbour_size, update_interval)

_agent = hrl.algorithms.ot.OTDQN(
    f_create_q=f_net,
    lower_weight=4.0,
    upper_weight=4.0,
    neighbour_size=neighbour_size,
    state_shape=state_shape,
    # OneStepTD arguments
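A quick walk-through of the bucket arithmetic in the snippet above, assuming a hypothetical replay_size of 10000 (replay_size itself is defined elsewhere in the original module). The active_ratio passed to BigPlayback presumably controls what fraction of buckets stays resident rather than cached on disk.

replay_size = 10000                          # assumed, not part of the snippet
max_traj_length = 50
bucket_size = 20
active_bucket = 10

traj_count = replay_size / max_traj_length   # 200 trajectories (Python 2 integer division)
bucket_count = traj_count / bucket_size      # 10 buckets of 20 trajectories each
ratio = 1.0 * active_bucket / bucket_count   # 1.0 under these assumptions
print(traj_count, bucket_count, ratio)       # -> (200, 10, 1.0)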
Example #4
def exp(dir_prefix, tf_log_dir="ckpt", our_log_dir="logging", replay_cache_dir="ReplayBufferCache",
        gpu_mem_fraction=0.15, save_checkpoint_secs=3600):
    n_skip = 6
    n_stack = 3
    if_random_phase = True
    # === Agent
    # --- agent basic
    ALL_ACTIONS = [(ord(mode),) for mode in ['s', 'd', 'a']] + [(0,)]
    AGENT_ACTIONS = ALL_ACTIONS[:3]
    num_actions = len(AGENT_ACTIONS)
    noop = 3
    gamma = 0.9
    ckpt_step = 0
    greedy_epsilon = CappedLinear(int(3e4)-ckpt_step, 0.2-(0.15/3e4*ckpt_step), 0.05)
    start_step = ckpt_step*6
    # --- replay buffer
    replay_bucket_size = 100
    replay_max_sample_epoch = 2
    # --- NN architecture
    f_net = lambda inputs: f_dueling_q(inputs, num_actions)
    if_ddqn = True
    # --- optimization
    batch_size = 8
    learning_rate = 1e-4
    target_sync_interval = 1
    target_sync_rate = 1e-3
    update_interval = 1
    max_grad_norm = 1.0
    sample_minimum_count = 100
    update_ratio = 8.0
    # --- logging and ckpt
    replay_capacity = 300
    replay_ratio_active = 1.0

    # ===  Reward function
    class FuncReward(object):
        def __init__(self, gamma):
            self.__gamma = gamma
            self._ema_speed = 10.0
            self._ema_dist = 0.0
            self._obs_risk = 0.0
            self._road_change = False
            self._mom_opp = 0.0
            self._mom_biking = 0.0
            self._steering = False
            self._waiting_steps = 0

        def reset(self):
            self._ema_speed = 10.0
            self._ema_dist = 0.0
            self._obs_risk = 0.0
            self._road_change = False
            self._mom_opp = 0.0
            self._mom_biking = 0.0
            self._steering = False

        def _func_scalar_reward(self, rewards, action):
            """Coverts a vector reward into a scalar."""
            info = {}

            # append a reward that is 1 when action is lane switching
            rewards = rewards.tolist()
            print (' '*3 + 'R: [' + '{:4.2f} ' * len(rewards) + ']').format(
                *rewards),

            # extract relevant rewards.
            speed = rewards[0]
            dist = rewards[1]
            obs_risk = rewards[2]
            # road_invalid = rewards[3] > 0.01  # any yellow or red
            road_change = rewards[4] > 0.01  # entering intersection
            opp = rewards[5]
            biking = rewards[6]
            # inner = rewards[7]
            # outter = rewards[8]
            steer = np.logical_or(action == 1, action == 2)
            if speed < 0.1:
                self._waiting_steps += 1
            else:
                self._waiting_steps = 0

            # update reward-related state vars
            ema_speed = 0.5 * self._ema_speed + 0.5 * speed
            ema_dist = 1.0 if dist > 2.0 else 0.9 * self._ema_dist
            mom_opp = min((opp < 0.5) * (self._mom_opp + 1), 20)
            mom_biking = min((biking > 0.5) * (self._mom_biking + 1), 12)
            steering = steer if action != 3 else self._steering
            self._ema_speed = ema_speed
            self._ema_dist = ema_dist
            self._obs_risk = obs_risk
            self._road_change = road_change
            self._mom_opp = mom_opp
            self._mom_biking = mom_biking
            self._steering = steering
            print '{:3.0f}, {:3.0f}, {:4.2f}, {:3.0f}'.format(
                mom_opp, mom_biking, ema_dist, self._steering),
            info['reward_fun/speed'] = speed
            info['reward_fun/dist2longest'] = dist
            info['reward_fun/obs_risk'] = obs_risk
            info['reward_fun/road_change'] = road_change
            info['reward_fun/on_opposite'] = opp
            info['reward_fun/on_biking'] = biking
            info['reward_fun/steer'] = steer
            info['reward_fun/mom_opposite'] = mom_opp
            info['reward_fun/mom_biking'] = mom_biking
            info['waiting_steps'] = self._waiting_steps

            # calculate scalar reward
            reward = [
                # velocity
                speed * 10 - 10,
                # obs factor
                -100.0 * obs_risk,
                # opposite
                -20 * (0.9 + 0.1 * mom_opp) * (mom_opp > 1.0),
                # ped
                -40 * (0.9 + 0.1 * mom_biking) * (mom_biking > 1.0),
                # steer
                steering * -40.0,
                # distance to longest
                -20.0 * (dist > 3.75/2)
            ]
            reward = np.sum(reward) / 100.0
            print ': {:5.2f}'.format(reward)

            return reward, info

        def _func_early_stopping(self):
            """Several early stopping criterion."""
            info = {}
            done = False
            # switched lane while going into intersection.
            if self._road_change and self._ema_dist > 0.2:
                print "[Episode early stopping] turned into intersection."
                done = True
                info['banned_road_change'] = True

            # used biking lane to cross intersection
            if self._road_change and self._mom_biking > 0:
                print "[Episode early stopping] entered intersection on biking lane."
                done = True
                info['banned_road_change'] = True

            # hit obstacle
            if self._obs_risk > 1.0:
                print "[Episode early stopping] hit obstacle."
                done = True

            # waiting too long
            if self._waiting_steps > 80:
                print "[Episode early stopping] waiting too long"
                done = True

            return done, info

        def _func_skipping_bias(self, reward, done, info, n_skip, cnt_skip):
            new_info = {}
            if 'banned_road_change' in info:
                reward -= 1.0 * (n_skip - cnt_skip)
            if done:
                pass
            new_info['reward_fun/reward'] = reward
            return reward, new_info

        def __call__(self, action, rewards, done, n_skip=1, cnt_skip=0):
            info = {}
            reward, info_diff = self._func_scalar_reward(rewards, action)
            info.update(info_diff)
            early_done, info_diff = self._func_early_stopping()
            done = done | early_done
            info.update(info_diff)
            reward, info_diff = self._func_skipping_bias(
                reward, done, info, n_skip, cnt_skip)
            info.update(info_diff)
            if done:
                info['flag_success'] = reward > 0.0
                self.reset()

            return reward, done, info
    # ==========================================
    # ==========================================
    # ==========================================

    env, replay_buffer, _agent = None, None, None
    try:
        # Parse flags
        # FLAGS = tf.app.flags.FLAGS
        tf_log_dir = os.sep.join([dir_prefix, tf_log_dir])
        our_log_dir = os.sep.join([dir_prefix, our_log_dir])
        replay_cache_dir = os.sep.join([dir_prefix, replay_cache_dir])

        # Modify tf graph
        graph = tf.get_default_graph()
        # -- create learning rate var and optimizer
        lr = tf.get_variable(
            'learning_rate', [], dtype=tf.float32,
            initializer=tf.constant_initializer(1e-3), trainable=False
        )
        lr_in = tf.placeholder(dtype=tf.float32)
        op_set_lr = tf.assign(lr, lr_in)
        optimizer_td = tf.train.AdamOptimizer(learning_rate=lr)
        # -- create global step variable
        global_step = tf.get_variable(
            'global_step', [], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False)

        def gen_default_backend_cmds():
            ws_path = '/Projects/catkin_ws/'
            initialD_path = '/Projects/hobotrl/playground/initialD/'
            backend_path = initialD_path + 'ros_environments/backend_scripts/'
            utils_path = initialD_path + 'ros_environments/backend_scripts/utils/'
            backend_cmds = [
                ['python', utils_path + 'iterate_test_case.py'],
                # Parse maps
                ['python', utils_path + 'parse_map.py',
                 ws_path + 'src/Map/src/map_api/data/honda_wider.xodr',
                 utils_path + 'road_segment_info.txt'],
                # Start roscore
                ['roscore'],
                # Reward function script
                ['python', backend_path + 'gazebo_rl_reward.py'],
                # Road validity node script
                ['python', backend_path + 'road_validity.py',
                 utils_path + 'road_segment_info.txt.signal'],
                # Simulation restarter backend
                ['python', backend_path+'rviz_restart.py', 'next.launch'],
            ]
            return backend_cmds

        # Environment
        env = FrameStack(DrSimDecisionK8S(backend_cmds=gen_default_backend_cmds()), n_stack)
        # Agent
        replay_buffer = BigPlayback(
            bucket_cls=MapPlayback,
            cache_path=replay_cache_dir,
            capacity=replay_capacity,
            bucket_size=replay_bucket_size,
            ratio_active=replay_ratio_active,
            max_sample_epoch=replay_max_sample_epoch,
        )
        state_shape = env.observation_space.shape
        __agent = DQN(
            f_create_q=f_net, state_shape=state_shape,
            # OneStepTD arguments
            num_actions=num_actions, discount_factor=gamma, ddqn=if_ddqn,
            # target network sync arguments
            target_sync_interval=target_sync_interval,
            target_sync_rate=target_sync_rate,
            # epsilon greedy arguments
            greedy_epsilon=greedy_epsilon,
            # optimizer arguments
            network_optimizer=LocalOptimizer(optimizer_td, max_grad_norm),
            # sampler arguments
            sampler=TransitionSampler(
                replay_buffer,
                batch_size=batch_size,
                interval=update_interval,
                minimum_count=sample_minimum_count),
            # checkpoint
            global_step=global_step
        )
        # Utilities
        stepsSaver = StepsSaver(our_log_dir)
        reward_vector2scalar = FuncReward(gamma)
        # Configure sess
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_fraction
        with __agent.create_session(
                config=config, save_dir=tf_log_dir,
                save_checkpoint_secs=save_checkpoint_secs) as sess, \
            AsynchronousAgent(
                agent=__agent, method='ratio', ratio=update_ratio) as _agent:
            agent = SkippingAgent(
                # n_skip_vec=(2, 6, 6),
                agent=_agent, n_skip=n_skip, specific_act=noop
            )
            summary_writer = SummaryWriterCache.get(tf_log_dir)
            # set vars
            sess.run(op_set_lr, feed_dict={lr_in: learning_rate})
            print "Using learning rate {}".format(sess.run(lr))
            n_ep = 0
            n_total_steps = start_step
            # GoGoGo
            while n_total_steps <= 2.5e5:
                cum_reward = 0.0
                n_ep_steps = 0
                state = env.reset()
                while True:
                    action = agent.act(state, exploration=False)
                    if action != 3:
                        print_qvals(
                            n_ep_steps, __agent, state, action, AGENT_ACTIONS
                        )
                    next_state, vec_reward, done, env_info = env.step(action)
                    reward, done, reward_info = reward_vector2scalar(
                        action, vec_reward, done, agent.n_skip, agent.cnt_skip
                    )
                    agent_info = agent.step(
                        sess=sess, state=state, action=action,
                        reward=reward, next_state=next_state,
                        episode_done=done, learning_off=True
                    )
                    env_info.update(reward_info)
                    summary_proto = log_info(
                        agent_info, env_info,
                        done,
                        cum_reward,
                        n_ep, n_ep_steps, n_total_steps,
                    )
                    summary_writer.add_summary(summary_proto, n_total_steps)
                    n_total_steps += 1
                    n_ep_steps += 1
                    cum_reward += reward
                    flag_success = reward_info['flag_success'] \
                        if 'flag_success' in reward_info else False
                    stepsSaver.save(
                        n_ep, n_total_steps,
                        state, action, vec_reward, reward, done,
                        cum_reward, flag_success
                    )
                    state = next_state
                    if done:
                        n_ep += 1
                        logging.warning(
                            "Episode {} finished in {} steps, reward is {}.".format(
                                n_ep, n_ep_steps, cum_reward,
                            )
                        )
                        break
                if n_ep >= 100:
                    break

    except Exception as e:
        print e.message
        traceback.print_exc()
    finally:
        logging.warning("="*30)
        logging.warning("="*30)
        logging.warning("Tidying up...")
        # kill orphaned monitor daemon process
        if env is not None:
            env.env.exit()
        if replay_buffer is not None:
            replay_buffer.close()
        if _agent is not None:
            _agent.stop()
        # os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)
        import time
        logging.warning("waiting for k8s end")
        time.sleep(180)
        logging.warning("="*30)
Example #5
# lr = tf.get_variable(
#     'learning_rate', [], dtype=tf.float32,
#     initializer=tf.constant_initializer(1e-3), trainable=False
# )
# lr_in = tf.placeholder(dtype=tf.float32)
# op_set_lr = tf.assign(lr, lr_in)
optimizer_td = tf.train.AdamOptimizer(learning_rate=1e-4)
global_step = tf.get_variable('global_step', [],
                              dtype=tf.int32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)

# 1 sample ~= 1MB @ 6x skipping
replay_buffer = BigPlayback(bucket_cls=BalancedMapPlayback,
                            cache_path=FLAGS.cache_path,
                            capacity=300000,
                            bucket_size=100,
                            ratio_active=0.05,
                            max_sample_epoch=2,
                            num_actions=len(AGENT_ACTIONS),
                            upsample_bias=(1, 1, 1, 0.1))

gamma = 0.9
_agent = hrl.DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=len(AGENT_ACTIONS),
    discount_factor=gamma,
    ddqn=True,
    # target network sync arguments
    target_sync_interval=1,
    target_sync_rate=target_sync_rate,
Example #6
lr = tf.get_variable('learning_rate', [],
                     dtype=tf.float32,
                     initializer=tf.constant_initializer(1e-3),
                     trainable=False)
lr_in = tf.placeholder(dtype=tf.float32)
op_set_lr = tf.assign(lr, lr_in)
optimizer_td = tf.train.AdamOptimizer(learning_rate=lr)
# -- create global step variable
global_step = tf.get_variable('global_step', [],
                              dtype=tf.int32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)
# Environment
env = FrameStack(DrSimDecisionK8S(), n_stack)
# Agent
replay_buffer = BigPlayback(
    bucket_cls=MapPlayback,
    cache_path=replay_cache_dir,
    capacity=replay_capacity,
    bucket_size=replay_bucket_size,
    ratio_active=replay_ratio_active,
    max_sample_epoch=replay_max_sample_epoch,
)
state_shape = env.observation_space.shape
__agent = DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=num_actions,
    discount_factor=gamma,
    ddqn=if_ddqn,
    # target network sync arguments
    target_sync_interval=target_sync_interval,
    target_sync_rate=target_sync_rate,
    # epsilon greedy arguments
Example #7
# lr and lr_in are assumed to have been created earlier with the same
# get_variable / placeholder pattern shown in the other examples.
op_set_lr = tf.assign(lr, lr_in)
optimizer_td = tf.train.AdamOptimizer(learning_rate=lr)
target_sync_rate = 1e-3
state_shape = env.observation_space.shape
graph = tf.get_default_graph()
global_step = tf.get_variable('global_step', [],
                              dtype=tf.int32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)

# 1 sample ~= 1MB @ 6x skipping
replay_buffer = BigPlayback(
    bucket_cls=BalancedMapPlayback,
    cache_path=FLAGS.cache_path,
    capacity=300000,
    bucket_size=100,
    ratio_active=0.05,
    max_sample_epoch=2,
    num_actions=NUM_PROXY_ACTION,
    upsample_bias=tuple([1.0 for _ in range(NUM_PROXY_ACTION)] + [0.1]))

gamma = 0.9
_agent = hrl.DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=NUM_PROXY_ACTION,
    discount_factor=gamma,
    ddqn=True,
    # target network sync arguments
    target_sync_interval=1,
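The upsample_bias expression in Example #7 simply builds one unit weight per proxy action plus a smaller trailing entry; assuming NUM_PROXY_ACTION were 3 (the real value is defined elsewhere), it evaluates to the same (1, 1, 1, 0.1) literal used in the other snippets:

NUM_PROXY_ACTION = 3                                     # assumed, for illustration
bias = tuple([1.0 for _ in range(NUM_PROXY_ACTION)] + [0.1])
print(bias)                                              # -> (1.0, 1.0, 1.0, 0.1)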
Example #8
lr = tf.get_variable(
    'learning_rate', [], dtype=tf.float32,
    initializer=tf.constant_initializer(1e-3), trainable=False
)
lr_in = tf.placeholder(dtype=tf.float32)
op_set_lr = tf.assign(lr, lr_in)
optimizer_td = tf.train.AdamOptimizer(learning_rate=1e-4)
global_step = tf.get_variable(
    'global_step', [], dtype=tf.int32,
    initializer=tf.constant_initializer(0), trainable=False)
op_global_step_set = tf.assign(global_step, 0)

# 1 sample ~= 1MB @ 6x skipping
replay_buffer = BigPlayback(
    bucket_cls=BalancedMapPlayback,
    cache_path="./ReplayBufferCache/experiment_255543117017472316",
    capacity=900000, bucket_size=300, ratio_active=0.05, max_sample_epoch=2,
    num_actions=len(AGENT_ACTIONS), upsample_bias=(1,1,1,0.1)
)

gamma = 0.9
_agent = hrl.DQN(
    f_create_q=f_net, state_shape=state_shape,
    # OneStepTD arguments
    num_actions=len(AGENT_ACTIONS), discount_factor=gamma, ddqn=True,
    # target network sync arguments
    target_sync_interval=1,
    target_sync_rate=target_sync_rate,
    # epsilon greedy arguments
    # greedy_epsilon=0.05,
    greedy_epsilon=CappedLinear(10000, 0.2, 0.05),
    # greedy_epsilon=CappedLinear(10000, 0.1, 0.025),
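For context, greedy_epsilon=CappedLinear(10000, 0.2, 0.05) describes an exploration rate that decays linearly from 0.2 to 0.05 over 10000 steps and then stays flat; the resume arithmetic in Example #4 (3e4 - ckpt_step, 0.2 - 0.15/3e4*ckpt_step, 0.05) is consistent with that reading. A simplified stand-in, not hobotrl's actual implementation:

def capped_linear(n_steps, start, end):
    # Linear interpolation from start to end over n_steps, then constant.
    def epsilon(step):
        if step >= n_steps:
            return end
        return start + (end - start) * float(step) / n_steps
    return epsilon

eps = capped_linear(10000, 0.2, 0.05)
print(eps(0), eps(5000), eps(20000))    # -> 0.2, 0.125, 0.05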