def __init__(self, env_name, port=2000, gpu=0, train_step=2000, evaluation_step=1000,
             max_ep_len=1000, polyak=0.995, start_steps=1000, batch_size=100,
             replay_size=50000, iteration=200, gamma=0.99, act_noise=0.1,
             target_noise=0.2, noise_clip=0.5, pi_lr=1e-4, q_lr=1e-3,
             policy_delay=2, logger_kwargs=dict()):
    # Logging
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())

    # Run lengths
    self.iteration = iteration
    self.train_step = train_step
    self.evaluation_step = evaluation_step

    # Environment and dimensions
    self.env = gym.make(env_name)
    self.obs_dim = self.env.observation_space.shape
    self.act_dim = self.env.action_space.shape[0]
    self.act_limit = self.env.action_space.high[0]

    # TD3 hyperparameters and counters
    self.start_steps = start_steps
    self.cur_train_step = 0
    self.cur_tensorboard_step = 0
    self.batch_size = batch_size
    self.max_ep_len = max_ep_len
    self.act_noise = act_noise
    self.target_noise = target_noise
    self.noise_clip = noise_clip
    self.policy_delay = policy_delay
    self.polyak = polyak
    self.gamma = gamma

    # Separate optimizers for the critics and the policy
    self.opti_q = tf.keras.optimizers.Adam(q_lr)
    self.opti_pi = tf.keras.optimizers.Adam(pi_lr)

    if debug_mode:
        self.summary = tf.summary.create_file_writer(
            os.path.join(self.logger.output_dir, "logs"))

    # Online and target actor-critic networks plus replay buffer
    self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
    self.target_actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
    self.replay_buffer = ReplayBuffer(replay_size)

    # self.critic = core.Critic()
    # net_params = self.critic.weights
    # self.target_actor_critic.set_weights(self.actor_critic.weights)
    self.target_init(self.target_actor_critic, self.actor_critic)
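
# `target_init` is called above but not defined in this excerpt. A minimal
# sketch (an assumption, not necessarily the repository's implementation) of
# the eager-mode version: hard-copy the online network's weights into the
# target network so both start from identical parameters.
def target_init(self, target_net, net):
    for target_var, var in zip(target_net.variables, net.variables):
        target_var.assign(var)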
def __init__(self, env_name, port=2000, gpu=0, train_step=10000, evaluation_step=3000,
             max_ep_len=300, polyak=0.995, start_steps=200, batch_size=100,
             replay_size=50000, iteration=200, gamma=0.99, act_noise=0.1,
             target_noise=0.2, noise_clip=0.5, pi_lr=1e-4, q_lr=1e-3,
             policy_delay=2, logger_kwargs=dict()):
    # Logging
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())

    # Run lengths
    self.iteration = iteration
    self.train_step = train_step
    self.evaluation_step = evaluation_step

    # CARLA environment (continuous control) and dimensions
    self.env = CarlaEnv(early_termination_enabled=True, run_offscreen=True,
                        port=port, gpu=gpu, discrete_control=False)
    self.obs_dim = self.env.observation_space.shape
    self.act_dim = self.env.action_space.shape[0]
    self.act_limit = self.env.action_space.high[0]

    # TD3 hyperparameters and counters
    self.start_steps = start_steps
    self.cur_train_step = 0
    self.cur_tensorboard_step = 0
    self.batch_size = batch_size
    self.max_ep_len = max_ep_len
    self.act_noise = act_noise
    self.target_noise = target_noise
    self.noise_clip = noise_clip
    self.policy_delay = policy_delay
    self.polyak = polyak
    self.gamma = gamma

    # Separate optimizers for the critics and the policy
    self.opti_q = tf.keras.optimizers.Adam(q_lr)
    self.opti_pi = tf.keras.optimizers.Adam(pi_lr)

    if debug_mode:
        self.summary = tf.summary.create_file_writer(
            os.path.join(self.logger.output_dir, "logs"))

    # Online and target actor-critic networks plus replay buffer
    self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
    self.target_actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
    self.replay_buffer = ReplayBuffer(replay_size)

    # Restore a pretrained CNN actor checkpoint used to initialize the policy
    self.loadpath = os.path.join(
        DEFAULT_DATA_DIR, "saver_0.45_0.45_0.05_0.1_tfaug_shuffle_first")
    actor = core.ActorCnn()
    load_check = tf.train.Checkpoint(model=actor)
    load_check.restore(os.path.join(self.loadpath, "model.ckpt-200"))

    # with tf.GradientTape() as tape:
    #     x = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim)
    #     x = tf.expand_dims(x, axis=0)
    #     a = tf.random.uniform(minval=0, maxval=1, shape=[self.act_dim])
    #     a = tf.expand_dims(a, axis=0)
    #     self.actor_critic([x, a])
    #     self.actor_critic.choose_action(x)
    #     self.target_actor_critic([x, a])
    #     self.target_actor_critic.choose_action(x)

    # Run a dummy forward pass so the actor variables are created before copying weights
    with tf.GradientTape() as tape:
        img = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim)
        img = tf.expand_dims(img, axis=0)
        speed = tf.random.uniform(minval=0, maxval=1, shape=(1,))
        speed = tf.expand_dims(speed, axis=0)
        self.actor_critic.actor([img, speed])
        self.target_actor_critic.actor([img, speed])
        actor([img, speed])

    # Copy the pretrained actor weights into the online actor; the zip relies on
    # both models exposing their actor variables in the same order
    for old_var, var in zip(actor.variables, self.actor_critic.variables):
        var.assign(old_var)
    var = self.actor_critic.actor.trainable_variables
    old_var = actor.trainable_variables

    # Targets start as an exact copy of the online networks
    self.target_init(self.target_actor_critic, self.actor_critic)

    # Checkpointing for the trained agent
    self.savepath = os.path.join(self.logger.output_dir, "saver")
    checkpoint = tf.train.Checkpoint(model=self.actor_critic,
                                     target_model=self.target_actor_critic)
    self.manager = tf.train.CheckpointManager(checkpoint, directory=self.savepath,
                                              max_to_keep=20,
                                              checkpoint_name="model.ckpt")
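
# `polyak` and `policy_delay` are stored above, but the soft target update for
# the eager/TF2 agent is not shown in this excerpt. A minimal sketch of how the
# Polyak-averaged update is typically applied after each delayed policy update
# (an assumption consistent with the hyperparameters above, not necessarily the
# repository's exact method):
def target_update(self):
    # theta_target <- polyak * theta_target + (1 - polyak) * theta_online
    for target_var, var in zip(self.target_actor_critic.variables,
                               self.actor_critic.variables):
        target_var.assign(self.polyak * target_var + (1.0 - self.polyak) * var)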
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--exp_name', type=str, default='td3_carla')
    args = parser.parse_args()

    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    filepath = osp.join(logger_kwargs["output_dir"], "saver")

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session("", config=config)
    sess.run(tf.global_variables_initializer())

    actor_critic = core.ActorCritic(act_dim=2)
    actor_critic.load_weights(osp.join(filepath, "model_30"))
    # check = tf.train.Checkpoint(model=actor_critic)
    # check.restore(tf.train.latest_checkpoint(filepath))

    # conv1 = actor_critic.get_layer("conv1").output
    # conv2 = actor_critic.get_layer("conv2").output
    # conv3 = actor_critic.get_layer("conv3").output
    # summary = tf.summary.FileWriter("log")
    # tf.summary.image("conv1", conv1)
    # tf.summary.image("conv2", conv2)
    # tf.summary.image("conv3", conv3)

    env = CarlaEnv()
    s_ph = tf.placeholder(dtype=tf.float32, shape=[None, 80, 80, 6])
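
    # A sketch of one way this evaluation script could use the restored network:
    # build an action op from the observation placeholder and roll the policy out
    # in CARLA for a single episode. `choose_action` is assumed from commented-out
    # code elsewhere in the repo, and the 4-tuple step interface follows the old
    # gym API; treat this as illustrative, not the script's actual behavior.
    pi_op = actor_critic.choose_action(s_ph)
    obs = env.reset()
    done, ep_ret = False, 0.0
    while not done:
        act = sess.run(pi_op, feed_dict={s_ph: obs[None]})[0]
        obs, rew, done, _ = env.step(act)
        ep_ret += rew
    print("episode return:", ep_ret)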
def __init__(self, env_name, port=2000, gpu=0, train_step=25000, evaluation_step=3000,
             max_ep_len=6000, alpha=0.35, epsilon_train=0.1, polyak=0.995,
             start_steps=1000, batch_size=100, replay_size=50000, iteration=200,
             gamma=0.99, act_noise=0.1, target_noise=0.2, noise_clip=0.5,
             pi_lr=1e-4, q_lr=1e-4, policy_delay=2, target_update_period=800,
             update_period=4, logger_kwargs=dict()):
    # Logging
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())

    # Run lengths
    self.iteration = iteration
    self.train_step = train_step
    self.evaluation_step = evaluation_step

    # CARLA environment (continuous control) and dimensions
    self.env = CarlaEnv(early_termination_enabled=True, run_offscreen=False,
                        port=port, gpu=gpu, discrete_control=False)
    self.obs_dim = self.env.observation_space.shape
    self.act_dim = self.env.action_space.shape[0]
    self.act_limit = self.env.action_space.high[0]

    # Hyperparameters and counters
    self.alpha = alpha
    self.start_steps = start_steps
    self.cur_train_step = 0
    self.batch_size = batch_size
    self.max_ep_len = max_ep_len
    self.act_noise = act_noise
    self.policy_delay = policy_delay

    if debug_mode:
        self.summary = tf.summary.FileWriter(
            os.path.join(self.logger.output_dir, "logs"))

    # self.obs_dim = (30, 30, 3)

    # Inputs to computation graph
    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        core.placeholders(self.obs_dim, self.act_dim, self.obs_dim, None, None)

    # Online and target networks; each call returns (q1, q2, pi)
    self.actor_critic = core.ActorCritic(self.act_dim)
    self.target_actor_critic = core.ActorCritic(self.act_dim)
    self.q1, self.q2, self.pi = self.actor_critic([self.x_ph, self.a_ph])
    self.q1_pi, _, _ = self.actor_critic([self.x_ph, self.pi])
    tar_q1, tar_q2, _ = self.target_actor_critic([self.x_ph, self.a_ph])
    _, _, pi_targ = self.target_actor_critic([self.x2_ph, self.a_ph])

    # Target policy smoothing: add clipped noise to the target action
    epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
    epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
    a2 = pi_targ + epsilon
    a2 = tf.clip_by_value(a2, -self.act_limit, self.act_limit)

    # Target Q-values, using the smoothed action from the target policy
    q1_targ, q2_targ, _ = self.target_actor_critic([self.x2_ph, a2])

    # Main outputs from computation graph
    # with tf.variable_scope('main'):
    #     # self.pi, self.q1, self.q2, q1_pi = core.cnn_actor_critic(self.x_ph, self.a_ph)
    #     self.pi, self.q1 = core.cnn_actor_critic(self.x_ph, self.a_ph)
    #     self.q2, q1_pi = self.pi, self.q1
    # # Target policy network
    # with tf.variable_scope('target'):
    #     pi_targ, _, _, _ = core.cnn_actor_critic(self.x2_ph, self.a_ph)
    # # Target Q networks
    # with tf.variable_scope('target', reuse=True):
    #     # Target policy smoothing, by adding clipped noise to target actions
    #     epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
    #     epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
    #     a2 = pi_targ + epsilon
    #     a2 = tf.clip_by_value(a2, -self.act_limit, self.act_limit)
    #     # Target Q-values, using action from target policy
    #     _, q1_targ, q2_targ, _ = core.cnn_actor_critic(self.x2_ph, a2)
    # q1_targ, q2_targ = q1_pi, q1_pi
    # q3, q4 = q1_pi, q1_pi

    self.replay_buffer = ReplayBuffer(replay_size)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * min_q_targ)

    # TD3 losses
    self.pi_loss = -tf.reduce_mean(self.q1_pi)
    q1_loss = tf.reduce_mean((self.q1 - backup) ** 2)
    q2_loss = tf.reduce_mean((self.q2 - backup) ** 2)
    self.q_loss = q1_loss + q2_loss

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    # self.train_pi_op = pi_optimizer.minimize(self.pi_loss, var_list=get_vars('main/pi') + get_vars('main/cnn'))
    # self.train_q_op = q_optimizer.minimize(self.q_loss, var_list=get_vars('main/q') + get_vars('main/cnn'))
    self.train_pi_op = pi_optimizer.minimize(self.pi_loss)
    self.train_q_op = q_optimizer.minimize(self.q_loss)

    # var = [v.name for v in tf.global_variables()]
    # v = tf.trainable_variables()
    var = [v.name for v in tf.trainable_variables()]
    print(var)

    if debug_mode:
        # TensorBoard histograms/scalars for Q-values, actions, variables and losses
        tf.summary.histogram("main/q1", self.q1)
        tf.summary.histogram("main/q2", self.q2)
        tf.summary.histogram("main/q1_pi", self.q1_pi)
        tf.summary.histogram("target/tar_q1", tar_q1)
        tf.summary.histogram("target/tar_q2", tar_q2)
        tf.summary.histogram("target/q1_tar", q1_targ)
        tf.summary.histogram("target/q2_tar", q2_targ)
        tf.summary.histogram("a/pi", self.pi)
        tf.summary.histogram("a/pi_tar", pi_targ)
        tf.summary.histogram("a/a2", a2)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.name, var)
        tf.summary.scalar("loss_q1", q1_loss)
        tf.summary.scalar("loss_q2", q2_loss)
        tf.summary.scalar("loss_q", self.q_loss)
        tf.summary.scalar("loss_pi", self.pi_loss)
        self.merge = tf.summary.merge_all()

    # Polyak averaging for target variables
    # self.target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
    #                                for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])
    self.target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(self.actor_critic.trainable_variables,
                                  self.target_actor_critic.trainable_variables)
    ])

    # Initializing targets to match main variables
    # target_init = tf.group([tf.assign(v_targ, v_main)
    #                         for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(self.actor_critic.trainable_variables,
                                  self.target_actor_critic.trainable_variables)
    ])

    # Session setup: initialize variables, then sync targets to the online networks
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session("", config=config)
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(target_init)

    # Checkpointing
    self.saver = tf.train.Checkpoint(model=self.actor_critic)
    self.savepath = os.path.join(self.logger.output_dir, "saver")
    if not os.path.exists(self.savepath):
        os.makedirs(self.savepath)
    self.manager = tf.train.CheckpointManager(self.saver, self.savepath,
                                              max_to_keep=10)
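
# The graph above defines train_q_op, train_pi_op and target_update, but the
# update schedule itself is not part of this excerpt. A minimal sketch of a
# TD3-style gradient step consistent with those ops; the method name, the
# replay buffer's sample_batch() call and its batch keys are assumptions, not
# the repository's actual interface.
def update(self, step):
    batch = self.replay_buffer.sample_batch(self.batch_size)
    feed_dict = {self.x_ph: batch['obs1'],
                 self.x2_ph: batch['obs2'],
                 self.a_ph: batch['acts'],
                 self.r_ph: batch['rews'],
                 self.d_ph: batch['done']}
    # Critics are updated on every gradient step.
    self.sess.run(self.train_q_op, feed_dict)
    # The policy and the Polyak-averaged targets are updated only every
    # `policy_delay` steps (delayed policy updates).
    if step % self.policy_delay == 0:
        self.sess.run([self.train_pi_op, self.target_update], feed_dict)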