    def __init__(self,
                 env_name,
                 port=2000,
                 gpu=0,
                 train_step=2000,
                 evaluation_step=1000,
                 max_ep_len=1000,
                 polyak=0.995,
                 start_steps=1000,
                 batch_size=100,
                 replay_size=50000,
                 iteration=200,
                 gamma=0.99,
                 act_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 pi_lr=1e-4,
                 q_lr=1e-3,
                 policy_delay=2,
                 logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        self.iteration = iteration
        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.env = gym.make(env_name)
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]
        self.start_steps = start_steps
        self.cur_train_step = 0
        self.cur_tensorboard_step = 0
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.act_limit = self.env.action_space.high[0]
        self.act_noise = act_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.polyak = polyak
        self.gamma = gamma
        self.opti_q = tf.keras.optimizers.Adam(q_lr)
        self.opti_pi = tf.keras.optimizers.Adam(pi_lr)

        if debug_mode:  # debug_mode is assumed to be a module-level flag defined elsewhere
            self.summary = tf.summary.create_file_writer(
                os.path.join(self.logger.output_dir, "logs"))

        self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
        self.target_actor_critic = core.ActorCritic(self.act_dim,
                                                    self.act_limit)
        self.replay_buffer = ReplayBuffer(replay_size)

        # self.critic = core.Critic()
        # net_params = self.critic.weights
        # self.target_actor_critic.set_weights(self.actor_critic.weights)
        self.target_init(self.target_actor_critic, self.actor_critic)
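The target_init helper called on the last line is not shown in this example. A minimal sketch of what it could look like in TF2 eager mode, assuming it simply hard-copies the main network's weights into the target network (the method name and signature are taken from the call above):

    def target_init(self, target_model, model):
        # Make the target network start from exactly the same parameters
        # as the main network before Polyak averaging takes over.
        for v_targ, v_main in zip(target_model.variables, model.variables):
            v_targ.assign(v_main)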
Example #2
    def __init__(self,
                 env_name,
                 port=2000,
                 gpu=0,
                 train_step=10000,
                 evaluation_step=3000,
                 max_ep_len=300,
                 polyak=0.995,
                 start_steps=200,
                 batch_size=100,
                 replay_size=50000,
                 iteration=200,
                 gamma=0.99,
                 act_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 pi_lr=1e-4,
                 q_lr=1e-3,
                 policy_delay=2,
                 logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        self.iteration = iteration
        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.env = CarlaEnv(early_termination_enabled=True,
                            run_offscreen=True,
                            port=port,
                            gpu=gpu,
                            discrete_control=False)
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]
        self.start_steps = start_steps
        self.cur_train_step = 0
        self.cur_tensorboard_step = 0
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.act_limit = self.env.action_space.high[0]
        self.act_noise = act_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.polyak = polyak
        self.gamma = gamma
        self.opti_q = tf.keras.optimizers.Adam(q_lr)
        self.opti_pi = tf.keras.optimizers.Adam(pi_lr)

        if debug_mode:
            self.summary = tf.summary.create_file_writer(
                os.path.join(self.logger.output_dir, "logs"))

        self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
        self.target_actor_critic = core.ActorCritic(self.act_dim,
                                                    self.act_limit)
        self.replay_buffer = ReplayBuffer(replay_size)

        self.loadpath = os.path.join(
            DEFAULT_DATA_DIR, "saver_0.45_0.45_0.05_0.1_tfaug_shuffle_first")
        actor = core.ActorCnn()
        load_check = tf.train.Checkpoint(model=actor)
        load_check.restore(os.path.join(self.loadpath, "model.ckpt-200"))

        # with tf.GradientTape() as tape:
        #     x = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim)
        #     x = tf.expand_dims(x, axis=0)
        #     a = tf.random.uniform(minval=0, maxval=1, shape=[self.act_dim])
        #     a = tf.expand_dims(a, axis=0)
        #     self.actor_critic([x,a])
        #     self.actor_critic.choose_action(x)
        #     self.target_actor_critic([x,a])
        #     self.target_actor_critic.choose_action(x)
        # Dummy forward pass to build all network variables before the
        # pretrained weights are copied below (the gradient tape is unused).
        with tf.GradientTape() as tape:
            img = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim)
            img = tf.expand_dims(img, axis=0)
            speed = tf.random.uniform(minval=0, maxval=1, shape=(1, ))
            speed = tf.expand_dims(speed, axis=0)
            self.actor_critic.actor([img, speed])
            self.target_actor_critic.actor([img, speed])
            actor([img, speed])
        # Copy the pretrained ActorCnn weights into the main actor-critic.
        for old_var, var in zip(actor.variables, self.actor_critic.variables):
            var.assign(old_var)
        # These references are not used further in __init__.
        var = self.actor_critic.actor.trainable_variables
        old_var = actor.trainable_variables

        self.target_init(self.target_actor_critic, self.actor_critic)

        self.savepath = os.path.join(self.logger.output_dir, "saver")
        checkpoint = tf.train.Checkpoint(model=self.actor_critic,
                                         target_model=self.target_actor_critic)
        self.manager = tf.train.CheckpointManager(checkpoint,
                                                  directory=self.savepath,
                                                  max_to_keep=20,
                                                  checkpoint_name="model.ckpt")
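Example #2 stores self.polyak but, as in Example #1, the soft target update itself is performed outside __init__. A minimal TF2 sketch of that update, assuming both networks expose their variables in the same order (the method name target_update is an assumption, not from the original):

    def target_update(self):
        # Polyak-averaged (soft) update:
        # v_targ <- polyak * v_targ + (1 - polyak) * v_main
        for v_targ, v_main in zip(self.target_actor_critic.variables,
                                  self.actor_critic.variables):
            v_targ.assign(self.polyak * v_targ + (1 - self.polyak) * v_main)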
Example #3
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--exp_name', type=str, default='td3_carla')
    args = parser.parse_args()

    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    filepath = osp.join(logger_kwargs["output_dir"], "saver")
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session("", config=config)
    sess.run(tf.global_variables_initializer())

    actor_critic = core.ActorCritic(act_dim=2)
    actor_critic.load_weights(osp.join(filepath, "model_30"))
    # check = tf.train.Checkpoint(model=actor_critic)
    # check.restore(tf.train.latest_checkpoint(filepath))

    # conv1 = actor_critic.get_layer("conv1").output
    # conv2 = actor_critic.get_layer("conv2").output
    # conv3 = actor_critic.get_layer("conv3").output
    #
    # summary = tf.summary.FileWriter("log")
    # tf.summary.image("conv1", conv1)
    # tf.summary.image("conv2", conv2)
    # tf.summary.image("conv3", conv3)
    env = CarlaEnv()
    s_ph = tf.placeholder(dtype=tf.float32, shape=[None, 80, 80, 6])
    def __init__(self,
                 env_name,
                 port=2000,
                 gpu=0,
                 train_step=25000,
                 evaluation_step=3000,
                 max_ep_len=6000,
                 alpha=0.35,
                 epsilon_train=0.1,
                 polyak=0.995,
                 start_steps=1000,
                 batch_size=100,
                 replay_size=50000,
                 iteration=200,
                 gamma=0.99,
                 act_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 pi_lr=1e-4,
                 q_lr=1e-4,
                 policy_delay=2,
                 target_update_period=800,
                 update_period=4,
                 logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        self.iteration = iteration
        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.env = CarlaEnv(early_termination_enabled=True,
                            run_offscreen=False,
                            port=port,
                            gpu=gpu,
                            discrete_control=False)
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]
        self.alpha = alpha
        self.start_steps = start_steps
        self.cur_train_step = 0
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.act_limit = self.env.action_space.high[0]
        self.act_noise = act_noise
        self.policy_delay = policy_delay

        if debug_mode:
            self.summary = tf.summary.FileWriter(
                os.path.join(self.logger.output_dir, "logs"))

        # self.obs_dim = (30, 30, 3)
        # Inputs to computation graph
        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            core.placeholders(self.obs_dim, self.act_dim, self.obs_dim, None, None)

        self.actor_critic = core.ActorCritic(self.act_dim)
        self.target_actor_critic = core.ActorCritic(self.act_dim)
        self.q1, self.q2, self.pi = self.actor_critic([self.x_ph, self.a_ph])
        self.q1_pi, _, _ = self.actor_critic([self.x_ph, self.pi])

        tar_q1, tar_q2, _ = self.target_actor_critic([self.x_ph, self.a_ph])
        _, _, pi_targ = self.target_actor_critic([self.x2_ph, self.a_ph])
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -self.act_limit, self.act_limit)
        q1_targ, q2_targ, _ = self.target_actor_critic([self.x2_ph, a2])

        # Main outputs from computation graph
        # with tf.variable_scope('main'):
        #     # self.pi, self.q1, self.q2, q1_pi = core.cnn_actor_critic(self.x_ph, self.a_ph)
        #     self.pi, self.q1, = core.cnn_actor_critic(self.x_ph, self.a_ph)
        #     self.q2, q1_pi = self.pi, self.q1

        # # Target policy network
        # with tf.variable_scope('target'):
        #     pi_targ, _, _, _  = core.cnn_actor_critic(self.x2_ph, self.a_ph)
        #
        #
        # # Target Q networks
        # with tf.variable_scope('target', reuse=True):
        #     # Target policy smoothing, by adding clipped noise to target actions
        #     epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        #     epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        #     a2 = pi_targ + epsilon
        #     a2 = tf.clip_by_value(a2, -self.act_limit, self.act_limit)
        #
        #     # Target Q-values, using action from target policy
        #     _, q1_targ, q2_targ, _ = core.cnn_actor_critic(self.x2_ph, a2)
        # q1_targ, q2_targ = q1_pi, q1_pi
        # q3, q4 = q1_pi, q1_pi

        self.replay_buffer = ReplayBuffer(replay_size)

        # Bellman backup for Q functions, using Clipped Double-Q targets
        min_q_targ = tf.minimum(q1_targ, q2_targ)
        backup = tf.stop_gradient(self.r_ph + gamma *
                                  (1 - self.d_ph) * min_q_targ)

        # TD3 losses
        self.pi_loss = -tf.reduce_mean(self.q1_pi)
        q1_loss = tf.reduce_mean((self.q1 - backup)**2)
        q2_loss = tf.reduce_mean((self.q2 - backup)**2)
        self.q_loss = q1_loss + q2_loss

        # Separate train ops for pi, q
        pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
        q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
        # self.train_pi_op = pi_optimizer.minimize(self.pi_loss, var_list=get_vars('main/pi') + get_vars('main/cnn'))
        # self.train_q_op = q_optimizer.minimize(self.q_loss, var_list=get_vars('main/q') + get_vars('main/cnn'))

        # Note: without an explicit var_list each optimizer updates all
        # trainable variables, not just its own network's (the scoped
        # var_list calls are commented out above).
        self.train_pi_op = pi_optimizer.minimize(self.pi_loss)
        self.train_q_op = q_optimizer.minimize(self.q_loss)

        # var = [v.name for v in tf.global_variables()]
        # v = tf.trainable_variables()
        var = [v.name for v in tf.trainable_variables()]
        print(var)  # debug: list the trainable variable names

        if debug_mode:
            tf.summary.histogram("main/q1", self.q1)
            tf.summary.histogram("main/q2", self.q2)
            tf.summary.histogram("main/q1_pi", self.q1_pi)
            tf.summary.histogram("target/tar_q1", tar_q1)
            tf.summary.histogram("target/tar_q2", tar_q2)
            tf.summary.histogram("target/q1_tar", q1_targ)
            tf.summary.histogram("target/q2_tar", q2_targ)

            tf.summary.histogram("a/pi", self.pi)
            tf.summary.histogram("a/pi_tar", pi_targ)
            tf.summary.histogram("a/a2", a2)

            for var in tf.trainable_variables():
                tf.summary.histogram(var.name, var)

            tf.summary.scalar("loss_q1", q1_loss)
            tf.summary.scalar("loss_q2", q2_loss)
            tf.summary.scalar("loss_q", self.q_loss)
            tf.summary.scalar("loss_pi", self.pi_loss)

            self.merge = tf.summary.merge_all()

        # Polyak averaging for target variables
        # self.target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
        #                           for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

        self.target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(
                self.actor_critic.trainable_variables,
                self.target_actor_critic.trainable_variables)
        ])

        # Initializing targets to match main variables
        # target_init = tf.group([tf.assign(v_targ, v_main)
        #                           for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

        target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                self.actor_critic.trainable_variables,
                self.target_actor_critic.trainable_variables)
        ])

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session("", config=config)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(target_init)
        self.saver = tf.train.Checkpoint(model=self.actor_critic)
        self.savepath = os.path.join(self.logger.output_dir, "saver")
        if not os.path.exists(self.savepath):
            os.makedirs(self.savepath)
        self.manager = tf.train.CheckpointManager(self.saver,
                                                  self.savepath,
                                                  max_to_keep=10)
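The graph built above would typically be driven by an update step like the following sketch. It is illustrative only: the replay buffer's sample() method, the batch dictionary keys, and the step counter j are assumptions, but it shows how the twin Q networks are trained every step while the policy and target updates are delayed by policy_delay, as in TD3.

        # Hypothetical update step (not part of the original example).
        batch = self.replay_buffer.sample(self.batch_size)  # assumed sample() API
        feed_dict = {self.x_ph: batch['obs'],
                     self.a_ph: batch['act'],
                     self.r_ph: batch['rew'],
                     self.x2_ph: batch['obs2'],
                     self.d_ph: batch['done']}
        # The twin Q networks are updated on every gradient step ...
        q_loss, _ = self.sess.run([self.q_loss, self.train_q_op], feed_dict)
        # ... while the policy and the Polyak target update run only every
        # policy_delay steps (delayed policy update).
        if j % self.policy_delay == 0:
            self.sess.run([self.train_pi_op, self.target_update], feed_dict)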