Example #1
def main(args):
    hparams = HyperParameters()
    hparams.summary_dir = FLAGS.summary_dir if FLAGS.summary_dir else hparams.summary_dir

    if FLAGS.phase == 'train':
        train_dataset = DataSet(hparams.train_image_dir,
                                hparams.batch_size, [224, 224, 3],
                                len(hparams.license_number_list),
                                include_label=True,
                                shuffle=True,
                                augmented=True)
        val_dataset = DataSet(hparams.val_image_dir,
                              hparams.batch_size, [224, 224, 3],
                              len(hparams.license_number_list),
                              include_label=True,
                              shuffle=False,
                              augmented=False)
        # test_dataset = DataSet(hparams.test_image_dir,
        #                        hparams.batch_size, [224, 224, 3],
        #                        is_train=False,
        #                        shuffle=False,
        #                        augmented=False)

        with tf.Session() as sess:
            model = Recognizer(hparams, trainable=True)
            model.train(sess,
                        train_dataset=train_dataset,
                        val_dataset=val_dataset,
                        load_checkpoint=FLAGS.load_checkpoint,
                        checkpoint=FLAGS.checkpoint)
    elif FLAGS.phase == 'eval':
        test_dataset = DataSet(hparams.test_image_dir,
                               hparams.batch_size, [224, 224, 3],
                               len(hparams.license_number_list),
                               include_label=True,
                               shuffle=False,
                               augmented=False)
        with tf.Session() as sess:
            model = Recognizer(hparams, trainable=True)
            model.eval(sess, test_dataset, checkpoint=FLAGS.checkpoint)
    else:
        test_dataset = DataSet(hparams.test_image_dir,
                               hparams.batch_size, [224, 224, 3],
                               len(hparams.license_number_list),
                               include_label=False,
                               shuffle=False,
                               augmented=False)
        with tf.Session() as sess:
            model = Recognizer(hparams, trainable=True)
            model.test(sess, test_dataset, checkpoint=FLAGS.checkpoint)
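
The FLAGS object referenced throughout main() is not defined in this snippet. A minimal sketch of how those flags might be declared, assuming tf.compat.v1.flags and flag names matching the attributes used above; the defaults and help strings are placeholders:

import tensorflow as tf

flags = tf.compat.v1.flags
flags.DEFINE_string("phase", "train", "One of 'train', 'eval', or 'test'.")
flags.DEFINE_string("summary_dir", "", "Overrides hparams.summary_dir when set.")
flags.DEFINE_boolean("load_checkpoint", False, "Whether to resume from a checkpoint.")
flags.DEFINE_string("checkpoint", "", "Path of the checkpoint to load.")
FLAGS = flags.FLAGS

if __name__ == "__main__":
    tf.compat.v1.app.run(main)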
Example #2
    def __init__(self, name):
        self.config = HyperParameters()
        self.all_state_size = self.config.all_state_size
        self.action_size = self.config.action_size
        self.tau = self.config.tau

        initial_learning_rate = self.config.lrc
        global_step = tf.Variable(0, trainable=False)
        self.learning_rate = tf.compat.v1.train.exponential_decay(
            initial_learning_rate,
            global_step=global_step,
            decay_steps=200000,
            decay_rate=0.99,
            staircase=True,
        )
        self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)
        self.optimizer_2 = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

        (
            self.state_inputs,
            self.action,
            self.critic_variables,
            self.q_value,
        ) = self.build_critic_network(name)
        (
            self.state_inputs_target,
            self.action_target,
            self.critic_variables_target,
            self.q_value_target,
        ) = self.build_critic_network(name + "_target")

        self.target = tf.compat.v1.placeholder(tf.float32,
                                               [None, self.config.task_size])
        self.ISWeights = tf.compat.v1.placeholder(tf.float32, [None, 1])
        self.absolute_errors = tf.abs(self.target -
                                      self.q_value)  # for updating sumtree
        self.action_gradients = tf.compat.v1.gradients(self.q_value, self.action)

        self.loss = tf.reduce_mean(
            self.ISWeights * tf.compat.v1.losses.huber_loss(
                labels=self.target, predictions=self.q_value))
        self.loss_2 = tf.reduce_mean(
            tf.compat.v1.losses.huber_loss(labels=self.target,
                                           predictions=self.q_value))
        self.optimize = self.optimizer.minimize(
            self.loss)  # global_step=global_step
        self.optimize_2 = self.optimizer_2.minimize(self.loss_2)

        self.update_target_op = [
            self.critic_variables_target[i].assign(
                tf.multiply(self.critic_variables[i], self.tau) +
                tf.multiply(self.critic_variables_target[i], 1 - self.tau))
            for i in range(len(self.critic_variables))
        ]
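
The training loop in Example #4 calls several critic methods that are not shown here (train, get_q_value_target, get_gradients, update_target). A hedged sketch of what they might look like, assuming build_critic_network exposes its state and action placeholders as the state_inputs/action attributes unpacked above; the feed keys are therefore assumptions, not the project's actual API:

    # Hypothetical helper methods consistent with the graph built in __init__ above.
    def train(self, sess, states, actions, targets, is_weights):
        loss, abs_errors, _ = sess.run(
            [self.loss, self.absolute_errors, self.optimize],
            feed_dict={self.state_inputs: states,
                       self.action: actions,
                       self.target: targets,
                       self.ISWeights: is_weights})
        return loss, abs_errors

    def get_q_value_target(self, sess, states, actions):
        return sess.run(self.q_value_target,
                        feed_dict={self.state_inputs_target: states,
                                   self.action_target: actions})

    def get_gradients(self, sess, states, actions):
        # gradient of the Q-value w.r.t. the action, used to train the actor
        return sess.run(self.action_gradients,
                        feed_dict={self.state_inputs: states,
                                   self.action: actions})

    def update_target(self, sess):
        sess.run(self.update_target_op)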
Example #3
    def __init__(self, name):
        # learning params
        self.config = HyperParameters()
        self.all_state_size = self.config.all_state_size
        self.action_size = self.config.action_size
        self.tau = self.config.tau

        # network params
        self.feature_head = 1
        self.features_per_head = 64
        initial_learning_rate = self.config.lra
        global_step = tf.Variable(0, trainable=False)
        self.learning_rate = tf.compat.v1.train.exponential_decay(
            initial_learning_rate,
            global_step=global_step,
            decay_steps=200000,
            decay_rate=0.99,
            staircase=True,
        )
        self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

        (
            self.state_inputs,
            self.actor_variables,
            self.action,
            self.attention_matrix,
        ) = self.build_actor_network(name)
        (
            self.state_inputs_target,
            self.actor_variables_target,
            self.action_target,
            self.attention_matrix_target,
        ) = self.build_actor_network(name + "_target")

        self.action_gradients = tf.compat.v1.placeholder(
            tf.float32, [None, self.action_size], name="action_gradients")
        self.actor_gradients = tf.compat.v1.gradients(self.action,
                                                      self.actor_variables,
                                                      -self.action_gradients)
        self.optimize = self.optimizer.apply_gradients(
            zip(self.actor_gradients,
                self.actor_variables))  # global_step=global_step

        self.update_target_op = [
            self.actor_variables_target[i].assign(
                tf.multiply(self.actor_variables[i], self.tau) +
                tf.multiply(self.actor_variables_target[i], 1 - self.tau))
            for i in range(len(self.actor_variables))
        ]
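
Likewise, the actor methods used in Examples #4 and #5 (get_action, get_action_target, get_action_noise, train, update_target) are not shown. A hedged sketch under the same placeholder-naming assumptions, with numpy imported as np; the noise scales and clipping ranges are illustrative only:

    # Hypothetical helper methods matching the calls in Examples #4 and #5.
    def get_action(self, sess, states):
        return sess.run(self.action, feed_dict={self.state_inputs: states})

    def get_action_target(self, sess, states):
        # TD3 target policy smoothing: add clipped noise to the target action
        action = sess.run(self.action_target,
                          feed_dict={self.state_inputs_target: states})
        noise = np.clip(np.random.normal(0.0, 0.2, size=action.shape), -0.5, 0.5)
        return np.clip(action + noise, -1.0, 1.0)

    def get_action_noise(self, sess, state, rate=1.0):
        # exploration noise scaled by `rate`; rate <= 0 returns the greedy action
        # (Example #5 passes rate=-1 at test time)
        action = sess.run(self.action,
                          feed_dict={self.state_inputs: np.reshape(state, (1, -1))})
        if rate > 0:
            action = action + rate * np.random.normal(0.0, 0.1, size=action.shape)
        return np.clip(action, -1.0, 1.0)

    def train(self, sess, states, action_gradients):
        sess.run(self.optimize,
                 feed_dict={self.state_inputs: states,
                            self.action_gradients: action_gradients})

    def update_target(self, sess):
        sess.run(self.update_target_op)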
Example #4
def train(
    training_scenarios,
    sim_name,
    headless,
    num_episodes,
    seed,
    without_soc_mt,
    session_dir,
):
    WITH_SOC_MT = without_soc_mt
    config = HyperParameters()
    configProto = init_tensorflow()

    # init env
    agent_spec = AgentSpec(
        # you can custom AgentInterface to control what obs information you need and the action type
        interface=cross_interface,
        # agent_builder=actor,
        # you can custom your observation adapter, reward adapter, info adapter, action adapter and so on.
        observation_adapter=observation_adapter,
        reward_adapter=reward_adapter,
        action_adapter=action_adapter,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=training_scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )

    # init nets structure
    if WITH_SOC_MT:
        model_name = "Soc_Mt_TD3Network"
        actor = SocMtActorNetwork(name="actor")
        critic_1 = SocMtCriticNetwork(name="critic_1")
        critic_2 = SocMtCriticNetwork(name="critic_2")
    else:
        model_name = "TD3Network"
        actor = ActorNetwork(name="actor")
        critic_1 = CriticNetwork(name="critic_1")
        critic_2 = CriticNetwork(name="critic_2")
    # tensorflow summary for tensorboard visualization
    writer = tf.compat.v1.summary.FileWriter("summary")
    # losses
    tf.compat.v1.summary.scalar("Loss", critic_1.loss)
    tf.compat.v1.summary.scalar("Hubor_loss", critic_1.loss_2)
    tf.compat.v1.summary.histogram("ISWeights", critic_1.ISWeights)
    write_op = tf.compat.v1.summary.merge_all()
    saver = tf.compat.v1.train.Saver(max_to_keep=1000)

    # init memory buffer
    buffer = Buffer(config.buffer_size, config.pretrain_length)
    if config.load_buffer:  # NOTE: the buffer capacity is limited by the loaded buffer file
        buffer = buffer.load_buffer(config.buffer_load_path)
        print("BUFFER: Buffer Loaded")
    else:
        buffer.fill_buffer(env, AGENT_ID)
        print("BUFFER: Buffer Filled")
        buffer.save_buffer(config.buffer_save_path, buffer)
    print("BUFFER: Buffer initialize")

    with tf.compat.v1.Session(config=configProto) as sess:
        # init nets params
        sess.run(tf.compat.v1.global_variables_initializer())
        writer.add_graph(sess.graph)
        # update params of the target network
        actor.update_target(sess)
        critic_1.update_target(sess)
        critic_2.update_target(sess)

        # Reinforcement Learning loop
        print("Training Starts...")
        # experiment results
        recent_rewards = []  # rewards from the most recent 100 episodes
        avarage_rewards = []  # average reward over the most recent 100 episodes
        recent_success = []
        recent_success_rate = []
        EPSILON = 1

        for episode in episodes(n=num_episodes):
            env_steps = 0
            # save the model from time to time
            if config.model_save_frequency:
                if episode.index % config.model_save_frequency == 0:
                    save_path = saver.save(sess,
                                           f"{session_dir}/{model_name}.ckpt")
                    print("latest model saved")
                if episode.index % config.model_save_frequency_no_paste == 0:
                    saver.save(
                        sess,
                        f"{session_dir}/{model_name}_{str(episode.index)}.ckpt",
                    )
                    print("model saved")

            # initialize
            EPSILON = (config.noised_episodes -
                       episode.index) / config.noised_episodes
            episode_reward = 0

            observations = env.reset()  # states of all vehs
            state = observations[AGENT_ID]  # ego state
            episode.record_scenario(env.scenario_log)
            dones = {"__all__": False}
            while not dones["__all__"]:
                action_noise = actor.get_action_noise(sess,
                                                      state,
                                                      rate=EPSILON)
                observations, rewards, dones, infos = env.step(
                    {AGENT_ID:
                     action_noise})  # states of all vehs in next step

                # ego state in next step
                next_state = observations[AGENT_ID]
                if WITH_SOC_MT:
                    reward = rewards[AGENT_ID]
                else:
                    reward = np.sum(list(rewards.values()))  # materialize dict_values before summing
                done = dones[AGENT_ID]
                info = infos[AGENT_ID]
                aux_info = get_aux_info(infos[AGENT_ID]["env_obs"])
                episode.record_step(observations, rewards, dones, infos)
                if WITH_SOC_MT:
                    episode_reward += np.sum(reward)
                else:
                    episode_reward += reward

                # store the experience
                experience = state, action_noise, reward, next_state, done
                # print(state)
                buffer.store(experience)

                ## Model training STARTS
                if env_steps % config.train_frequency == 0:
                    # "Delayed" Policy Updates
                    policy_delayed = 2
                    for _ in range(policy_delayed):
                        # First we need a mini-batch with experiences (s, a, r, s', done)
                        tree_idx, batch, ISWeights_mb = buffer.sample(
                            config.batch_size)
                        s_mb, a_mb, r_mb, next_s_mb, dones_mb = get_split_batch(
                            batch)
                        task_mb = s_mb[:, -config.task_size:]
                        next_task_mb = next_s_mb[:, -config.task_size:]

                        # Get q_target values for next_state from the critic_target
                        if WITH_SOC_MT:
                            a_target_next_state = actor.get_action_target(
                                sess,
                                next_s_mb)  # with Target Policy Smoothing
                            q_target_next_state_1 = critic_1.get_q_value_target(
                                sess, next_s_mb, a_target_next_state)
                            q_target_next_state_1 = (q_target_next_state_1 *
                                                     next_task_mb
                                                     )  # multi task q value
                            q_target_next_state_2 = critic_2.get_q_value_target(
                                sess, next_s_mb, a_target_next_state)
                            q_target_next_state_2 = (q_target_next_state_2 *
                                                     next_task_mb
                                                     )  # multi task q value
                            q_target_next_state = np.minimum(
                                q_target_next_state_1, q_target_next_state_2)
                        else:
                            a_target_next_state = actor.get_action_target(
                                sess,
                                next_s_mb)  # with Target Policy Smoothing
                            q_target_next_state_1 = critic_1.get_q_value_target(
                                sess, next_s_mb, a_target_next_state)
                            q_target_next_state_2 = critic_2.get_q_value_target(
                                sess, next_s_mb, a_target_next_state)
                            q_target_next_state = np.minimum(
                                q_target_next_state_1, q_target_next_state_2)

                        # Set Q_target = r if the episode ends at s+1, otherwise Q_target = r + gamma * Qtarget(s',a')
                        target_Qs_batch = []
                        for i in range(0, len(dones_mb)):
                            terminal = dones_mb[i]
                            # in a terminal state the target is just the reward
                            if terminal:
                                target_Qs_batch.append((r_mb[i] * task_mb[i]))
                            else:
                                # take the Q target for action a'
                                target = (
                                    r_mb[i] * task_mb[i] +
                                    config.gamma * q_target_next_state[i])
                                target_Qs_batch.append(target)
                        targets_mb = np.array(target_Qs_batch)

                        # critic train
                        if len(a_mb.shape) > 2:
                            a_mb = np.squeeze(a_mb, axis=1)
                        loss, absolute_errors = critic_1.train(
                            sess, s_mb, a_mb, targets_mb, ISWeights_mb)
                        loss_2, absolute_errors_2 = critic_2.train(
                            sess, s_mb, a_mb, targets_mb, ISWeights_mb)
                    # actor train
                    a_for_grad = actor.get_action(sess, s_mb)
                    a_gradients = critic_1.get_gradients(
                        sess, s_mb, a_for_grad)
                    # print(a_gradients)
                    actor.train(sess, s_mb, a_gradients[0])
                    # target train
                    actor.update_target(sess)
                    critic_1.update_target(sess)
                    critic_2.update_target(sess)

                    # update replay memory priorities
                    if WITH_SOC_MT:
                        absolute_errors = np.sum(absolute_errors, axis=1)
                    buffer.batch_update(tree_idx, absolute_errors)
                    ## Model training ENDS

                if done:
                    # visualize reward data
                    recent_rewards.append(episode_reward)
                    if len(recent_rewards) > 100:
                        recent_rewards.pop(0)
                    avarage_rewards.append(np.mean(recent_rewards))
                    avarage_rewards_data = np.array(avarage_rewards)
                    d = {"avarage_rewards": avarage_rewards_data}
                    with open(os.path.join("results", "reward_data" + ".pkl"),
                              "wb") as f:
                        pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
                    # visualize success rate data
                    if aux_info == "success":
                        recent_success.append(1)
                    else:
                        recent_success.append(0)
                    if len(recent_success) > 100:
                        recent_success.pop(0)
                    avarage_success_rate = recent_success.count(1) / len(
                        recent_success)
                    recent_success_rate.append(avarage_success_rate)
                    recent_success_rate_data = np.array(recent_success_rate)
                    d = {"recent_success_rates": recent_success_rate_data}
                    with open(os.path.join("results", "success_rate_data.pkl"), "wb") as f:
                        pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
                    # print results on the terminal
                    print("Episode total reward:", episode_reward)
                    print("Episode time:", env_steps * 0.1)
                    print("Success rate:", avarage_success_rate)
                    print(episode.index, "episode finished.")
                    buffer.measure_utilization()
                    print("---" * 15)
                    break
                else:
                    state = next_state
                    env_steps += 1
        env.close()
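
The get_split_batch helper used in the training loop above is not shown. A plausible minimal version, assuming buffer.sample() yields a batch of the (state, action_noise, reward, next_state, done) tuples stored above; if the priority tree nests each experience one level deeper, the indexing below would need adjusting:

import numpy as np

def get_split_batch(batch):
    # batch: iterable of (state, action, reward, next_state, done) experiences
    s_mb = np.array([e[0] for e in batch], dtype=np.float32)
    a_mb = np.array([e[1] for e in batch], dtype=np.float32)
    r_mb = np.array([e[2] for e in batch], dtype=np.float32)
    next_s_mb = np.array([e[3] for e in batch], dtype=np.float32)
    dones_mb = np.array([e[4] for e in batch], dtype=np.bool_)
    return s_mb, a_mb, r_mb, next_s_mb, dones_mb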
Example #5
def test(test_scenarios, sim_name, headless, num_episodes, seed):
    config = HyperParameters()
    configProto = init_tensorflow()
    # init env
    agent_spec = AgentSpec(
        # you can custom AgentInterface to control what obs information you need and the action type
        interface=cross_interface,
        # agent_builder=actor,
        # you can custom your observation adapter, reward adapter, info adapter, action adapter and so on.
        observation_adapter=observation_adapter,
        reward_adapter=reward_adapter,
        action_adapter=action_adapter,
    )

    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=test_scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )
    # init nets structure
    if WITH_SOC_MT:  # WITH_SOC_MT is assumed to be a module-level flag (cf. Example #4)
        model_name = "Soc_Mt_TD3Network"
        actor = SocMtActorNetwork(name="actor")
        critic_1 = SocMtCriticNetwork(name="critic_1")
        critic_2 = SocMtCriticNetwork(name="critic_2")
    else:
        model_name = "TD3Network"
        actor = ActorNetwork(name="actor")
        critic_1 = CriticNetwork(name="critic_1")
        critic_2 = CriticNetwork(name="critic_2")
    saver = tf.compat.v1.train.Saver()
    with tf.compat.v1.Session(config=configProto) as sess:
        # load network
        saver = tf.compat.v1.train.import_meta_graph(f"models/{model_name}.ckpt.meta")
        if saver is None:
            print("did not load")
        else:
            saver.restore(sess, f"models/{model_name}.ckpt")

        # init testing params
        test_num = num_episodes  # denominator for the summary printed below
        test_ep = 0
        # results record
        success = 0
        failure = 0
        passed_case = 0

        collision = 0
        trouble_collision = 0
        time_exceed = 0
        episode_time_record = []

        # start testing
        for episode in episodes(n=num_episodes):
            episode_reward = 0
            env_steps = 0  # step in one episode
            observations = env.reset()  # states of all vehs
            state = observations[AGENT_ID]  # ego state
            episode.record_scenario(env.scenario_log)
            dones = {"__all__": False}
            while not dones["__all__"]:
                action = actor.get_action_noise(sess, state, rate=-1)
                observations, rewards, dones, infos = env.step(
                    {AGENT_ID: action})  # states of all vehs in next step

                # ego state in next step
                state = observations[AGENT_ID]
                if WITH_SOC_MT:
                    reward = rewards[AGENT_ID]
                else:
                    reward = np.sum(list(rewards.values()))  # materialize dict_values before summing
                done = dones[AGENT_ID]
                info = infos[AGENT_ID]
                aux_info = get_aux_info(infos[AGENT_ID]["env_obs"])
                episode.record_step(observations, rewards, dones, infos)
                if WITH_SOC_MT:
                    episode_reward += np.sum(reward)
                else:
                    episode_reward += reward
                env_steps += 1

                if done:
                    test_ep += 1
                    # record result
                    if aux_info == "collision":
                        collision += 1
                        failure += 1
                    elif aux_info == "trouble_collision":
                        trouble_collision += 1
                        passed_case += 1
                    elif aux_info == "time_exceed":
                        time_exceed += 1
                        failure += 1
                    else:
                        # get episode time
                        episode_time_record.append(env_steps * 0.1)
                        success += 1
                    # print
                    print(
                        episode.index,
                        "EPISODE ended",
                        "TOTAL REWARD {:.4f}".format(episode_reward),
                        "Result:",
                        aux_info,
                    )
                    print("total step of this episode: ", env_steps)
                    episode_reward = 0
                    env_steps = 0
                    observations = env.reset()  # states of all vehs
                    state = observations[AGENT_ID]  # ego state
        env.close()

        print("-*" * 15, " result ", "-*" * 15)
        print("success: ", success, "/", test_num)
        print("collision: ", collision, "/", test_num)
        print("time_exceed: ", time_exceed, "/", test_num)
        print("passed_case: ", passed_case, "/", test_num)
        print("average time: ", np.mean(episode_time_record))