upsample_bias=replay_upsample_bias)

# Build the DQN agent. Every value passed below (f_net, state_shape,
# num_actions, gamma, ...) is a name defined outside this statement.
_agent = DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=num_actions,
    discount_factor=gamma,
    ddqn=if_ddqn,
    # target network sync arguments
    target_sync_interval=target_sync_interval,
    target_sync_rate=target_sync_rate,
    # epsilon greedy arguments
    greedy_epsilon=greedy_epsilon,
    # optimizer arguments: the TD optimizer is wrapped in a LocalOptimizer
    # together with max_grad_norm
    network_optimizer=LocalOptimizer(optimizer_td, max_grad_norm),
    # sampler arguments: transitions are drawn from replay_buffer.
    # NOTE: `sample_mimimum_count` is spelled this way (sic) at the call site.
    sampler=TransitionSampler(replay_buffer,
                              batch_size=batch_size,
                              interval=update_interval,
                              minimum_count=sample_mimimum_count),
    # checkpoint
    global_step=global_step)


def func_compile_exp_agent(state, action, rewards, next_state, done):
    global cnt_skip
    global n_skip
    global mom_opp
    global mom_biking
    global ema_dist
Exemple #2
0
    def __init__(self, env=None, episode_n=1000, discount_factor=0.95,
                 f_se=None, f_actor=None, f_critic=None, network_optimizer_ctor=None,
                 target_sync_interval=10, target_sync_rate=0.01, batch_size=32,
                 ou_params=None, replay_capacity=10000, **kwargs):
        """Fill in a default for every omitted component, then call the parent.

        Any of ``env``, ``f_se``, ``f_actor``, ``f_critic``,
        ``network_optimizer_ctor`` and ``ou_params`` left as ``None`` is
        replaced by the default built here. All remaining arguments are
        forwarded to the parent constructor unchanged.

        :param env: environment; defaults to a wrapped "CarRacing-v0".
        :param episode_n: forwarded to the parent constructor.
        :param discount_factor: forwarded to the parent constructor.
        :param f_se: callable ``inputs -> {"se": tensor}``.
        :param f_actor: callable ``inputs -> {"action": tensor}``.
        :param f_critic: callable ``inputs -> {"q": tensor}``.
        :param network_optimizer_ctor: zero-argument callable returning a
            network optimizer.
        :param target_sync_interval: forwarded to the parent constructor.
        :param target_sync_rate: forwarded to the parent constructor.
        :param batch_size: forwarded to the parent constructor.
        :param ou_params: 3-tuple forwarded to the parent constructor
            (presumably Ornstein-Uhlenbeck noise parameters -- TODO confirm).
        :param replay_capacity: forwarded to the parent constructor.
        :param kwargs: extra keyword arguments forwarded to the parent.
        """
        # Default environment: CarRacing-v0 with the wrappers applied one
        # after another, innermost first.
        if env is None:
            wrapped = gym.make("CarRacing-v0")
            wrapped = CarRacingGoalWrapper(wrapped)
            wrapped = NoneSkipWrapper(wrapped, skip=5, render_all_steps=True)
            wrapped = CarGrassWrapper(wrapped, grass_penalty=0.5)
            wrapped = ScaledFloatFrame(wrapped)
            wrapped = FrameStack(wrapped, 4)
            wrapped = ScaledRewards(wrapped, 0.2)
            env = wrapped

        # Hyper-parameters shared by the default network builders below.
        weight_decay = 1e-7
        hidden_activation = tf.nn.elu
        se_width = 256
        action_width = env.action_space.shape[-1]

        def build_default_se(inputs):
            # Convolutional stack over the observation, then one hidden
            # fully-connected layer projecting to `se_width` units.
            observation = inputs[0]
            conv_out = Utils.conv2ds(
                observation,
                shape=[(8, 8, 4), (16, 4, 2), (32, 3, 2)],
                out_flatten=True,
                activation=hidden_activation,
                l2=weight_decay,
                var_scope="se_conv"
            )
            encoded = Utils.layer_fcs(
                conv_out, [256], se_width,
                activation_hidden=hidden_activation,
                activation_out=None,
                l2=weight_decay,
                var_scope="se_linear"
            )
            return {"se": encoded}

        def build_default_actor(inputs):
            # Two hidden layers; tanh is applied to the action output.
            encoded = inputs[0]
            return {
                "action": Utils.layer_fcs(
                    encoded, [256, 256], action_width,
                    activation_hidden=hidden_activation,
                    activation_out=tf.nn.tanh,
                    l2=weight_decay,
                    var_scope="actor"
                )
            }

        def build_default_critic(inputs):
            # Q-value from the concatenation of inputs[0] and inputs[1];
            # axis 1 of the single-unit output is squeezed away.
            encoded, chosen_action = inputs[0], inputs[1]
            joint = tf.concat([encoded, chosen_action], axis=-1)
            q_value = Utils.layer_fcs(
                joint, [256, 256], 1,
                activation_hidden=hidden_activation,
                activation_out=None,
                l2=weight_decay,
                var_scope="q"
            )
            return {"q": tf.squeeze(q_value, axis=1)}

        def build_default_optimizer():
            # network_optimizer_ctor = lambda: LocalOptimizer(tf.train.AdamOptimizer(0.0), grad_clip=10.0)
            return LocalOptimizer(tf.train.AdamOptimizer(3e-5), grad_clip=10.0)

        if f_se is None:
            f_se = build_default_se
        if f_actor is None:
            f_actor = build_default_actor
        if f_critic is None:
            f_critic = build_default_critic
        if network_optimizer_ctor is None:
            network_optimizer_ctor = build_default_optimizer
        if ou_params is None:
            # ou_params = (0, 1, 0)
            ou_params = (0, 0.2, CappedExp(2e5, 0.5, 0.02))

        super(DDPGCarRacingSoftGoal, self).__init__(
            env, f_se, f_actor, f_critic,
            episode_n, discount_factor, network_optimizer_ctor,
            ou_params, target_sync_interval, target_sync_rate,
            batch_size, replay_capacity,
            **kwargs
        )
Exemple #3
0
def exp(dir_prefix, tf_log_dir="ckpt", our_log_dir="logging", replay_cache_dir="ReplayBufferCache",
        gpu_mem_fraction=0.15, save_checkpoint_secs=3600):
    n_skip = 6
    n_stack = 3
    if_random_phase = True
    # === Agent
    # --- agent basic
    ALL_ACTIONS = [(ord(mode),) for mode in ['s', 'd', 'a']] + [(0,)]
    AGENT_ACTIONS = ALL_ACTIONS[:3]
    num_actions = len(AGENT_ACTIONS)
    noop = 3
    gamma = 0.9
    ckpt_step = 0
    greedy_epsilon = CappedLinear(int(3e4)-ckpt_step, 0.2-(0.15/3e4*ckpt_step), 0.05)
    start_step = ckpt_step*6
    # --- replay buffer
    replay_bucket_size = 100
    replay_max_sample_epoch = 2
    # --- NN architecture
    f_net = lambda inputs: f_dueling_q(inputs, num_actions)
    if_ddqn = True
    # --- optimization
    batch_size = 8
    learning_rate = 1e-4
    target_sync_interval = 1
    target_sync_rate = 1e-3
    update_interval = 1
    max_grad_norm = 1.0
    sample_mimimum_count = 100
    update_ratio = 8.0
    # --- logging and ckpt
    replay_capacity = 300
    replay_ratio_active = 1.0

    # ===  Reward function
    class FuncReward(object):
        def __init__(self, gamma):
            self.__gamma = gamma
            self._ema_speed = 10.0
            self._ema_dist = 0.0
            self._obs_risk = 0.0
            self._road_change = False
            self._mom_opp = 0.0
            self._mom_biking = 0.0
            self._steering = False
            self._waiting_steps = 0

        def reset(self):
            self._ema_speed = 10.0
            self._ema_dist = 0.0
            self._obs_risk = 0.0
            self._road_change = False
            self._mom_opp = 0.0
            self._mom_biking = 0.0
            self._steering = False

        def _func_scalar_reward(self, rewards, action):
            """Coverts a vector reward into a scalar."""
            info = {}

            # append a reward that is 1 when action is lane switching
            rewards = rewards.tolist()
            print (' '*3 + 'R: [' + '{:4.2f} ' * len(rewards) + ']').format(
                *rewards),

            # extract relevant rewards.
            speed = rewards[0]
            dist = rewards[1]
            obs_risk = rewards[2]
            # road_invalid = rewards[3] > 0.01  # any yellow or red
            road_change = rewards[4] > 0.01  # entering intersection
            opp = rewards[5]
            biking = rewards[6]
            # inner = rewards[7]
            # outter = rewards[8]
            steer = np.logical_or(action == 1, action == 2)
            if speed < 0.1:
                self._waiting_steps += 1
            else:
                self._waiting_steps = 0

            # update reward-related state vars
            ema_speed = 0.5 * self._ema_speed + 0.5 * speed
            ema_dist = 1.0 if dist > 2.0 else 0.9 * self._ema_dist
            mom_opp = min((opp < 0.5) * (self._mom_opp + 1), 20)
            mom_biking = min((biking > 0.5) * (self._mom_biking + 1), 12)
            steering = steer if action != 3 else self._steering
            self._ema_speed = ema_speed
            self._ema_dist = ema_dist
            self._obs_risk = obs_risk
            self._road_change = road_change
            self._mom_opp = mom_opp
            self._mom_biking = mom_biking
            self._steering = steering
            print '{:3.0f}, {:3.0f}, {:4.2f}, {:3.0f}'.format(
                mom_opp, mom_biking, ema_dist, self._steering),
            info['reward_fun/speed'] = speed
            info['reward_fun/dist2longest'] = dist
            info['reward_fun/obs_risk'] = obs_risk
            info['reward_fun/road_change'] = road_change
            info['reward_fun/on_opposite'] = opp
            info['reward_fun/on_biking'] = biking
            info['reward_fun/steer'] = steer
            info['reward_fun/mom_opposite'] = mom_opp
            info['reward_fun/mom_biking'] = mom_biking
            info['waiting_steps'] = self._waiting_steps

            # calculate scalar reward
            reward = [
                # velocity
                speed * 10 - 10,
                # obs factor
                -100.0 * obs_risk,
                # opposite
                -20 * (0.9 + 0.1 * mom_opp) * (mom_opp > 1.0),
                # ped
                -40 * (0.9 + 0.1 * mom_biking) * (mom_biking > 1.0),
                # steer
                steering * -40.0,
                # distance to longest
                -20.0 * (dist > 3.75/2)
            ]
            reward = np.sum(reward) / 100.0
            print ': {:5.2f}'.format(reward)

            return reward, info

        def _func_early_stopping(self):
            """Several early stopping criterion."""
            info = {}
            done = False
            # switched lane while going into intersection.
            if self._road_change and self._ema_dist > 0.2:
                print "[Episode early stopping] turned into intersection."
                done = True
                info['banned_road_change'] = True

            # used biking lane to cross intersection
            if self._road_change and self._mom_biking > 0:
                print "[Episode early stopping] entered intersection on biking lane."
                done = True
                info['banned_road_change'] = True

            # hit obstacle
            if self._obs_risk > 1.0:
                print "[Episode early stopping] hit obstacle."
                done = True

            # waiting too long
            if self._waiting_steps > 80:
                print "[Episode early stopping] waiting too long"
                done = True

            return done, info

        def _func_skipping_bias(self, reward, done, info, n_skip, cnt_skip):
            new_info = {}
            if 'banned_road_change' in info:
                reward -= 1.0 * (n_skip - cnt_skip)
            if done:
                pass
            new_info['reward_fun/reward'] = reward
            return reward, new_info

        def __call__(self, action, rewards, done, n_skip=1, cnt_skip=0):
            info = {}
            reward, info_diff  = self._func_scalar_reward(rewards, action)
            info.update(info_diff)
            early_done, info_diff = self._func_early_stopping()
            done = done | early_done
            info.update(info_diff)
            reward, info_diff = self._func_skipping_bias(
                reward, done, info, n_skip, cnt_skip)
            info.update(info_diff)
            if done:
                info['flag_success'] = reward > 0.0
                self.reset()

            return reward, done, info
    # ==========================================
    # ==========================================
    # ==========================================

    env, replay_buffer, _agent = None, None, None
    try:
        # Parse flags
        # FLAGS = tf.app.flags.FLAGS
        tf_log_dir = os.sep.join([dir_prefix, tf_log_dir])
        our_log_dir = os.sep.join([dir_prefix, our_log_dir])
        replay_cache_dir = os.sep.join([dir_prefix, replay_cache_dir])

        # Modify tf graph
        graph = tf.get_default_graph()
        # -- create learning rate var and optimizer
        lr = tf.get_variable(
            'learning_rate', [], dtype=tf.float32,
            initializer=tf.constant_initializer(1e-3), trainable=False
        )
        lr_in = tf.placeholder(dtype=tf.float32)
        op_set_lr = tf.assign(lr, lr_in)
        optimizer_td = tf.train.AdamOptimizer(learning_rate=lr)
        # -- create global step variable
        global_step = tf.get_variable(
            'global_step', [], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False)

        def gen_default_backend_cmds():
            ws_path = '/Projects/catkin_ws/'
            initialD_path = '/Projects/hobotrl/playground/initialD/'
            backend_path = initialD_path + 'ros_environments/backend_scripts/'
            utils_path = initialD_path + 'ros_environments/backend_scripts/utils/'
            backend_cmds = [
                ['python', utils_path + '/iterate_test_case.py'],
                # Parse maps
                ['python', utils_path + 'parse_map.py',
                 ws_path + 'src/Map/src/map_api/data/honda_wider.xodr',
                 utils_path + 'road_segment_info.txt'],
                # Start roscore
                ['roscore'],
                # Reward function script
                ['python', backend_path + 'gazebo_rl_reward.py'],
                # Road validity node script
                ['python', backend_path + 'road_validity.py',
                 utils_path + 'road_segment_info.txt.signal'],
                # Simulation restarter backend
                ['python', backend_path+'rviz_restart.py', 'next.launch'],
            ]
            return backend_cmds

        # Environment
        env = FrameStack(DrSimDecisionK8S(backend_cmds=gen_default_backend_cmds()), n_stack)
        # Agent
        replay_buffer = BigPlayback(
            bucket_cls=MapPlayback,
            cache_path=replay_cache_dir,
            capacity=replay_capacity,
            bucket_size=replay_bucket_size,
            ratio_active=replay_ratio_active,
            max_sample_epoch=replay_max_sample_epoch,
        )
        state_shape = env.observation_space.shape
        __agent = DQN(
            f_create_q=f_net, state_shape=state_shape,
            # OneStepTD arguments
            num_actions=num_actions, discount_factor=gamma, ddqn=if_ddqn,
            # target network sync arguments
            target_sync_interval=target_sync_interval,
            target_sync_rate=target_sync_rate,
            # epsilon greedy arguments
            greedy_epsilon=greedy_epsilon,
            # optimizer arguments
            network_optimizer=LocalOptimizer(optimizer_td, max_grad_norm),
            # sampler arguments
            sampler=TransitionSampler(
                replay_buffer,
                batch_size=batch_size,
                interval=update_interval,
                minimum_count=sample_mimimum_count),
            # checkpoint
            global_step=global_step
         )
        # Utilities
        stepsSaver = StepsSaver(our_log_dir)
        reward_vector2scalar = FuncReward(gamma)
        # Configure sess
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_fraction
        with __agent.create_session(
                config=config, save_dir=tf_log_dir,
                save_checkpoint_secs=save_checkpoint_secs) as sess, \
            AsynchronousAgent(
                agent=__agent, method='ratio', ratio=update_ratio) as _agent:
            agent = SkippingAgent(
                # n_skip_vec=(2, 6, 6),
                agent=_agent, n_skip=n_skip, specific_act=noop
            )
            summary_writer = SummaryWriterCache.get(tf_log_dir)
            # set vars
            sess.run(op_set_lr, feed_dict={lr_in: learning_rate})
            print "Using learning rate {}".format(sess.run(lr))
            n_ep = 0
            n_total_steps = start_step
            # GoGoGo
            while n_total_steps <= 2.5e5:
                cum_reward = 0.0
                n_ep_steps = 0
                state = env.reset()
                while True:
                    action = agent.act(state, exploration=False)
                    if action != 3:
                        print_qvals(
                            n_ep_steps, __agent, state, action, AGENT_ACTIONS
                        )
                    next_state, vec_reward, done, env_info = env.step(action)
                    reward, done, reward_info = reward_vector2scalar(
                        action, vec_reward, done, agent.n_skip, agent.cnt_skip
                    )
                    agent_info = agent.step(
                        sess=sess, state=state, action=action,
                        reward=reward, next_state=next_state,
                        episode_done=done, learning_off=True
                    )
                    env_info.update(reward_info)
                    summary_proto = log_info(
                        agent_info, env_info,
                        done,
                        cum_reward,
                        n_ep, n_ep_steps, n_total_steps,
                    )
                    summary_writer.add_summary(summary_proto, n_total_steps)
                    n_total_steps += 1
                    n_ep_steps += 1
                    cum_reward += reward
                    flag_success = reward_info['flag_success'] \
                        if 'flag_success' in reward_info else False
                    stepsSaver.save(
                        n_ep, n_total_steps,
                        state, action, vec_reward, reward, done,
                        cum_reward, flag_success
                    )
                    state = next_state
                    if done:
                        n_ep += 1
                        logging.warning(
                            "Episode {} finished in {} steps, reward is {}.".format(
                                n_ep, n_ep_steps, cum_reward,
                            )
                        )
                        break
                if n_ep >= 100:
                    break

    except Exception as e:
        print e.message
        traceback.print_exc()
    finally:
        logging.warning("="*30)
        logging.warning("="*30)
        logging.warning("Tidying up...")
        # kill orphaned monitor daemon process
        if env is not None:
            env.env.exit()
        replay_buffer.close()
        if replay_buffer is not None:
            replay_buffer.close()
        if _agent is not None:
            _agent.stop()
        # os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)
        import time
        logging.warning("waiting for k8s end")
        time.sleep(180)
        logging.warning("="*30)
Exemple #4
0
 f_q = f_normal2
 input_state = tf.placeholder(dtype=tf.float32,
                              shape=[None, dim_state],
                              name="input_state")
 input_action = tf.placeholder(dtype=tf.float32,
                               shape=[None, dim_action],
                               name="input_action")
 input_noise = tf.placeholder(dtype=tf.float32,
                              shape=[None, dim_noise],
                              name="input_noise")
 generator_net = Network([input_state, input_noise],
                         f_generator,
                         var_scope="generator")
 generator_func = NetworkFunction(generator_net["action"])
 logprob_net = Network([input_state, input_action], f_q, var_scope="q")
 optimizer = LocalOptimizer()
 batch_size = 8
 m_particles = 128
 test_batch = 256
 optimizer.add_updater(SVGDUpdater(generator_net,
                                   logprob_net,
                                   m_particles=m_particles),
                       name="svgd")
 optimizer.compile()
 plt.ion()
 plt.show()
 for g in range(8):
     with tf.Session() as sess:
         sess.run(tf.global_variables_initializer())
         generator_func.set_session(sess)
         for i in range(1000):