def main(args):
    hparams = HyperParameters()
    hparams.summary_dir = FLAGS.summary_dir if FLAGS.summary_dir else hparams.summary_dir

    if FLAGS.phase == 'train':
        train_dataset = DataSet(hparams.train_image_dir,
                                hparams.batch_size,
                                [224, 224, 3],
                                len(hparams.license_number_list),
                                include_label=True,
                                shuffle=True,
                                augmented=True)
        val_dataset = DataSet(hparams.val_image_dir,
                              hparams.batch_size,
                              [224, 224, 3],
                              len(hparams.license_number_list),
                              include_label=True,
                              shuffle=False,
                              augmented=False)
        # test_dataset = DataSet(hparams.test_image_dir,
        #                        hparams.batch_size, [224, 224, 3],
        #                        is_train=False,
        #                        shuffle=False,
        #                        augmented=False)
        with tf.Session() as sess:
            model = Recognizer(hparams, trainable=True)
            model.train(sess,
                        train_dataset=train_dataset,
                        val_dataset=val_dataset,
                        load_checkpoint=FLAGS.load_checkpoint,
                        checkpoint=FLAGS.checkpoint)
    elif FLAGS.phase == 'eval':
        test_dataset = DataSet(hparams.test_image_dir,
                               hparams.batch_size,
                               [224, 224, 3],
                               len(hparams.license_number_list),
                               include_label=True,
                               shuffle=False,
                               augmented=False)
        with tf.Session() as sess:
            model = Recognizer(hparams, trainable=True)
            model.eval(sess, test_dataset, checkpoint=FLAGS.checkpoint)
    else:
        test_dataset = DataSet(hparams.test_image_dir,
                               hparams.batch_size,
                               [224, 224, 3],
                               len(hparams.license_number_list),
                               include_label=False,
                               shuffle=False,
                               augmented=False)
        with tf.Session() as sess:
            model = Recognizer(hparams, trainable=True)
            model.test(sess, test_dataset, checkpoint=FLAGS.checkpoint)
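# Hedged sketch (not part of the original listing): main() above reads
# FLAGS.phase, FLAGS.summary_dir, FLAGS.load_checkpoint and FLAGS.checkpoint.
# If the script does not already define them, a TF1-style flag definition and
# entry point would look roughly like this; the defaults and help strings are
# illustrative assumptions.
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('phase', 'train', "One of 'train', 'eval' or 'test'.")
tf.app.flags.DEFINE_string('summary_dir', '', 'Overrides hparams.summary_dir when non-empty.')
tf.app.flags.DEFINE_boolean('load_checkpoint', False, 'Resume training from a checkpoint.')
tf.app.flags.DEFINE_string('checkpoint', '', 'Checkpoint path to restore.')

if __name__ == '__main__':
    tf.app.run(main)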
def __init__(self, name):
    self.config = HyperParameters()
    self.all_state_size = self.config.all_state_size
    self.action_size = self.config.action_size
    self.tau = self.config.tau

    initial_learning_rate = self.config.lrc
    global_step = tf.Variable(0, trainable=False)
    self.learning_rate = tf.compat.v1.train.exponential_decay(
        initial_learning_rate,
        global_step=global_step,
        decay_steps=200000,
        decay_rate=0.99,
        staircase=True,
    )
    self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)
    self.optimizer_2 = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

    (
        self.state_inputs,
        self.action,
        self.critic_variables,
        self.q_value,
    ) = self.build_critic_network(name)
    (
        self.state_inputs_target,
        self.action_target,
        self.critic_variables_target,
        self.q_value_target,
    ) = self.build_critic_network(name + "_target")

    self.target = tf.compat.v1.placeholder(tf.float32, [None, self.config.task_size])
    self.ISWeights = tf.compat.v1.placeholder(tf.float32, [None, 1])
    self.absolute_errors = tf.abs(self.target - self.q_value)  # for updating the sumtree

    self.action_gradients = tf.gradients(self.q_value, self.action)

    self.loss = tf.reduce_mean(
        self.ISWeights * tf.compat.v1.losses.huber_loss(
            labels=self.target, predictions=self.q_value))
    self.loss_2 = tf.reduce_mean(
        tf.compat.v1.losses.huber_loss(labels=self.target,
                                       predictions=self.q_value))
    self.optimize = self.optimizer.minimize(self.loss)  # global_step=global_step
    self.optimize_2 = self.optimizer_2.minimize(self.loss_2)

    # soft (Polyak) update: theta_target <- tau * theta + (1 - tau) * theta_target
    self.update_target_op = [
        self.critic_variables_target[i].assign(
            tf.multiply(self.critic_variables[i], self.tau) +
            tf.multiply(self.critic_variables_target[i], 1 - self.tau))
        for i in range(len(self.critic_variables))
    ]
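# Hedged sketch (not part of the original listing): session-wrapper helpers with
# the signatures the training loop below expects from the critic
# (get_q_value_target, get_gradients, train, update_target). The real class
# presumably defines equivalents; these bodies only illustrate which ops and
# placeholders from __init__ they would fetch and feed (the real train() may
# also run optimize_2 / loss_2, which here only feed the TensorBoard summary).
def get_q_value_target(self, sess, state, action):
    # Q'(s', a') from the target critic.
    return sess.run(self.q_value_target,
                    feed_dict={self.state_inputs_target: state,
                               self.action_target: action})

def get_gradients(self, sess, state, action):
    # dQ/da, the policy-gradient signal handed to the actor.
    return sess.run(self.action_gradients,
                    feed_dict={self.state_inputs: state, self.action: action})

def train(self, sess, state, action, target, ISWeights):
    # One importance-weighted Huber-loss step; absolute errors update the sumtree.
    _, loss, absolute_errors = sess.run(
        [self.optimize, self.loss, self.absolute_errors],
        feed_dict={self.state_inputs: state,
                   self.action: action,
                   self.target: target,
                   self.ISWeights: ISWeights})
    return loss, absolute_errors

def update_target(self, sess):
    # Soft (Polyak) update of the target critic.
    sess.run(self.update_target_op)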
def __init__(self, name):
    # learning params
    self.config = HyperParameters()
    self.all_state_size = self.config.all_state_size
    self.action_size = self.config.action_size
    self.tau = self.config.tau
    # network params
    self.feature_head = 1
    self.features_per_head = 64

    initial_learning_rate = self.config.lra
    global_step = tf.Variable(0, trainable=False)
    self.learning_rate = tf.compat.v1.train.exponential_decay(
        initial_learning_rate,
        global_step=global_step,
        decay_steps=200000,
        decay_rate=0.99,
        staircase=True,
    )
    self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)

    (
        self.state_inputs,
        self.actor_variables,
        self.action,
        self.attention_matrix,
    ) = self.build_actor_network(name)
    (
        self.state_inputs_target,
        self.actor_variables_target,
        self.action_target,
        self.attention_matrix_target,
    ) = self.build_actor_network(name + "_target")

    # dQ/da fed in from the critic; the actor ascends this gradient
    self.action_gradients = tf.compat.v1.placeholder(
        tf.float32, [None, self.action_size], name="action_gradients")
    self.actor_gradients = tf.compat.v1.gradients(self.action,
                                                  self.actor_variables,
                                                  -self.action_gradients)
    self.optimize = self.optimizer.apply_gradients(
        zip(self.actor_gradients, self.actor_variables))  # global_step=global_step

    # soft (Polyak) update: theta_target <- tau * theta + (1 - tau) * theta_target
    self.update_target_op = [
        self.actor_variables_target[i].assign(
            tf.multiply(self.actor_variables[i], self.tau) +
            tf.multiply(self.actor_variables_target[i], 1 - self.tau))
        for i in range(len(self.actor_variables))
    ]
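# Hedged sketch (not part of the original listing): session-wrapper helpers
# matching the calls the training loop makes on the actor (get_action,
# get_action_target, train, update_target). Exploration noise
# (get_action_noise) and target policy smoothing are omitted because their
# exact form is not shown in this listing.
def get_action(self, sess, state):
    # Deterministic action mu(s) from the online policy.
    return sess.run(self.action, feed_dict={self.state_inputs: state})

def get_action_target(self, sess, state):
    # Action mu'(s') from the target policy, used to build the TD3 target.
    return sess.run(self.action_target,
                    feed_dict={self.state_inputs_target: state})

def train(self, sess, state, action_gradients):
    # Ascend dQ/da through the policy (deterministic policy gradient step).
    sess.run(self.optimize,
             feed_dict={self.state_inputs: state,
                        self.action_gradients: action_gradients})

def update_target(self, sess):
    # Soft (Polyak) update of the target actor.
    sess.run(self.update_target_op)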
def train(
    training_scenarios,
    sim_name,
    headless,
    num_episodes,
    seed,
    without_soc_mt,
    session_dir,
):
    WITH_SOC_MT = without_soc_mt

    config = HyperParameters()
    configProto = init_tensorflow()
    # init env
    agent_spec = AgentSpec(
        # you can customize AgentInterface to control the observation information you need and the action type
        interface=cross_interface,
        # agent_builder=actor,
        # you can customize your observation adapter, reward adapter, info adapter, action adapter and so on
        observation_adapter=observation_adapter,
        reward_adapter=reward_adapter,
        action_adapter=action_adapter,
    )
    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=training_scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )
    # init nets structure
    if WITH_SOC_MT:
        model_name = "Soc_Mt_TD3Network"
        actor = SocMtActorNetwork(name="actor")
        critic_1 = SocMtCriticNetwork(name="critic_1")
        critic_2 = SocMtCriticNetwork(name="critic_2")
    else:
        model_name = "TD3Network"
        actor = ActorNetwork(name="actor")
        critic_1 = CriticNetwork(name="critic_1")
        critic_2 = CriticNetwork(name="critic_2")
    # tensorflow summary for tensorboard visualization
    writer = tf.compat.v1.summary.FileWriter("summary")
    # losses
    tf.compat.v1.summary.scalar("Loss", critic_1.loss)
    tf.compat.v1.summary.scalar("Hubor_loss", critic_1.loss_2)
    tf.compat.v1.summary.histogram("ISWeights", critic_1.ISWeights)
    write_op = tf.compat.v1.summary.merge_all()
    saver = tf.compat.v1.train.Saver(max_to_keep=1000)

    # init memory buffer
    buffer = Buffer(config.buffer_size, config.pretrain_length)
    if config.load_buffer:
        # !!! the buffer capacity is limited by the loaded buffer file
        buffer = buffer.load_buffer(config.buffer_load_path)
        print("BUFFER: Buffer Loaded")
    else:
        buffer.fill_buffer(env, AGENT_ID)
        print("BUFFER: Buffer Filled")
        buffer.save_buffer(config.buffer_save_path, buffer)
    print("BUFFER: Buffer initialized")

    with tf.compat.v1.Session(config=configProto) as sess:
        # init nets params
        sess.run(tf.compat.v1.global_variables_initializer())
        writer.add_graph(sess.graph)
        # update params of the target networks
        actor.update_target(sess)
        critic_1.update_target(sess)
        critic_2.update_target(sess)

        # Reinforcement Learning loop
        print("Training Starts...")
        # experiment results
        recent_rewards = []  # rewards of the most recent 100 episodes
        avarage_rewards = []  # average reward over the most recent 100 episodes
        recent_success = []
        recent_success_rate = []
        EPSILON = 1

        for episode in episodes(n=num_episodes):
            env_steps = 0
            # save the model from time to time
            if config.model_save_frequency:
                if episode.index % config.model_save_frequency == 0:
                    save_path = saver.save(sess, f"{session_dir}/{model_name}.ckpt")
                    print("latest model saved")
                if episode.index % config.model_save_frequency_no_paste == 0:
                    saver.save(
                        sess,
                        f"{session_dir}/{model_name}_{str(episode.index)}.ckpt",
                    )
                    print("model saved")

            # initialize
            EPSILON = (config.noised_episodes - episode.index) / config.noised_episodes
            episode_reward = 0
            observations = env.reset()  # states of all vehs
            state = observations[AGENT_ID]  # ego state
            episode.record_scenario(env.scenario_log)
            dones = {"__all__": False}

            while not dones["__all__"]:
                action_noise = actor.get_action_noise(sess, state, rate=EPSILON)
                observations, rewards, dones, infos = env.step(
                    {AGENT_ID: action_noise})  # states of all vehs in next step
                # ego state in next step
                next_state = observations[AGENT_ID]
                if WITH_SOC_MT:
                    reward = rewards[AGENT_ID]
                else:
                    # rewards is a dict keyed by agent id; sum its values
                    reward = np.sum(list(rewards.values()))
                done = dones[AGENT_ID]
                info = infos[AGENT_ID]
                aux_info = get_aux_info(infos[AGENT_ID]["env_obs"])
                episode.record_step(observations, rewards, dones, infos)
                if WITH_SOC_MT:
                    episode_reward += np.sum(reward)
                else:
                    episode_reward += reward
                # store the experience
                experience = state, action_noise, reward, next_state, done
                # print(state)
                buffer.store(experience)

                ## Model training STARTS
                if env_steps % config.train_frequency == 0:
                    # "Delayed" Policy Updates: the critics are updated
                    # policy_delayed times before the actor and targets update once
                    policy_delayed = 2
                    for _ in range(policy_delayed):
                        # First we need a mini-batch of experiences (s, a, r, s', done)
                        tree_idx, batch, ISWeights_mb = buffer.sample(
                            config.batch_size)
                        s_mb, a_mb, r_mb, next_s_mb, dones_mb = get_split_batch(
                            batch)
                        task_mb = s_mb[:, -config.task_size:]
                        next_task_mb = next_s_mb[:, -config.task_size:]
                        # Get q_target values for next_state from the critic_target
                        if WITH_SOC_MT:
                            a_target_next_state = actor.get_action_target(
                                sess, next_s_mb)  # with Target Policy Smoothing
                            q_target_next_state_1 = critic_1.get_q_value_target(
                                sess, next_s_mb, a_target_next_state)
                            q_target_next_state_1 = (q_target_next_state_1 *
                                                     next_task_mb)  # multi task q value
                            q_target_next_state_2 = critic_2.get_q_value_target(
                                sess, next_s_mb, a_target_next_state)
                            q_target_next_state_2 = (q_target_next_state_2 *
                                                     next_task_mb)  # multi task q value
                            q_target_next_state = np.minimum(
                                q_target_next_state_1, q_target_next_state_2)
                        else:
                            a_target_next_state = actor.get_action_target(
                                sess, next_s_mb)  # with Target Policy Smoothing
                            q_target_next_state_1 = critic_1.get_q_value_target(
                                sess, next_s_mb, a_target_next_state)
                            q_target_next_state_2 = critic_2.get_q_value_target(
                                sess, next_s_mb, a_target_next_state)
                            q_target_next_state = np.minimum(
                                q_target_next_state_1, q_target_next_state_2)

                        # Set Q_target = r if the episode ends at s+1,
                        # otherwise Q_target = r + gamma * Q_target(s', a')
                        target_Qs_batch = []
                        for i in range(0, len(dones_mb)):
                            terminal = dones_mb[i]
                            # if we are in a terminal state, the target only equals the reward
                            if terminal:
                                target_Qs_batch.append(r_mb[i] * task_mb[i])
                            else:
                                # take the Q target for action a'
                                target = (r_mb[i] * task_mb[i] +
                                          config.gamma * q_target_next_state[i])
                                target_Qs_batch.append(target)
                        targets_mb = np.array([each for each in target_Qs_batch])

                        # critic train
                        if len(a_mb.shape) > 2:
                            a_mb = np.squeeze(a_mb, axis=1)
                        loss, absolute_errors = critic_1.train(
                            sess, s_mb, a_mb, targets_mb, ISWeights_mb)
                        loss_2, absolute_errors_2 = critic_2.train(
                            sess, s_mb, a_mb, targets_mb, ISWeights_mb)

                    # actor train (delayed: once per policy_delayed critic updates)
                    a_for_grad = actor.get_action(sess, s_mb)
                    a_gradients = critic_1.get_gradients(sess, s_mb, a_for_grad)
                    # print(a_gradients)
                    actor.train(sess, s_mb, a_gradients[0])
                    # target train
                    actor.update_target(sess)
                    critic_1.update_target(sess)
                    critic_2.update_target(sess)

                    # update replay memory priorities
                    if WITH_SOC_MT:
                        absolute_errors = np.sum(absolute_errors, axis=1)
                    buffer.batch_update(tree_idx, absolute_errors)
                    ## Model training ENDS

                if done:
                    # visualize reward data
                    recent_rewards.append(episode_reward)
                    if len(recent_rewards) > 100:
                        recent_rewards.pop(0)
                    avarage_rewards.append(np.mean(recent_rewards))
                    avarage_rewards_data = np.array(avarage_rewards)
                    d = {"avarage_rewards": avarage_rewards_data}
                    with open(os.path.join("results", "reward_data" + ".pkl"),
                              "wb") as f:
                        pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
                    # visualize success rate data
                    if aux_info == "success":
                        recent_success.append(1)
                    else:
                        recent_success.append(0)
                    if len(recent_success) > 100:
                        recent_success.pop(0)
                    avarage_success_rate = recent_success.count(1) / len(
                        recent_success)
                    recent_success_rate.append(avarage_success_rate)
                    recent_success_rate_data = np.array(recent_success_rate)
                    d = {"recent_success_rates": recent_success_rate_data}
                    with open(
                            os.path.join("results",
                                         "success_rate_data" + ".pkl"),
                            "wb") as f:
                        pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
                    # print results on the terminal
                    print("Episode total reward:", episode_reward)
                    print("Episode time:", env_steps * 0.1)
                    print("Success rate:", avarage_success_rate)
                    print(episode.index, "episode finished.")
                    buffer.measure_utilization()
                    print("---" * 15)
                    break
                else:
                    state = next_state
                    env_steps += 1
    env.close()
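# Hedged sketch (not part of the original listing): one way get_split_batch
# could unpack a sampled mini-batch, assuming each entry of `batch` holds the
# (state, action, reward, next_state, done) tuple stored by buffer.store()
# above. The project's actual helper may organize the batch differently
# (e.g. wrapping each experience for the prioritized-replay tree).
def get_split_batch(batch):
    states = np.array([each[0] for each in batch])
    actions = np.array([each[1] for each in batch])
    rewards = np.array([each[2] for each in batch])
    next_states = np.array([each[3] for each in batch])
    dones = np.array([each[4] for each in batch])
    return states, actions, rewards, next_states, dones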
def test(test_scenarios, sim_name, headless, num_episodes, seed):
    config = HyperParameters()
    configProto = init_tensorflow()
    # init env
    agent_spec = AgentSpec(
        # you can customize AgentInterface to control the observation information you need and the action type
        interface=cross_interface,
        # agent_builder=actor,
        # you can customize your observation adapter, reward adapter, info adapter, action adapter and so on
        observation_adapter=observation_adapter,
        reward_adapter=reward_adapter,
        action_adapter=action_adapter,
    )
    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=test_scenarios,
        agent_specs={AGENT_ID: agent_spec},
        sim_name=sim_name,
        headless=headless,
        timestep_sec=0.1,
        seed=seed,
    )
    # init nets structure
    if WITH_SOC_MT:
        model_name = "Soc_Mt_TD3Network"
        actor = SocMtActorNetwork(name="actor")
        critic_1 = SocMtCriticNetwork(name="critic_1")
        critic_2 = SocMtCriticNetwork(name="critic_2")
    else:
        model_name = "TD3Network"
        actor = ActorNetwork(name="actor")
        critic_1 = CriticNetwork(name="critic_1")
        critic_2 = CriticNetwork(name="critic_2")
    saver = tf.compat.v1.train.Saver()

    with tf.compat.v1.Session(config=configProto) as sess:
        # load network
        saver = tf.compat.v1.train.import_meta_graph("models/" + model_name +
                                                     ".ckpt" + ".meta")
        saver.restore(sess, "models/" + model_name + ".ckpt")
        if saver is None:
            print("did not load")

        # init testing params
        test_num = 100
        test_ep = 0
        # results record
        success = 0
        failure = 0
        passed_case = 0
        collision = 0
        trouble_collision = 0
        time_exceed = 0
        episode_time_record = []

        # start testing
        for episode in episodes(n=num_episodes):
            episode_reward = 0
            env_steps = 0  # steps in one episode
            observations = env.reset()  # states of all vehs
            state = observations[AGENT_ID]  # ego state
            episode.record_scenario(env.scenario_log)
            dones = {"__all__": False}

            while not dones["__all__"]:
                action = actor.get_action_noise(sess, state, rate=-1)
                observations, rewards, dones, infos = env.step(
                    {AGENT_ID: action})  # states of all vehs in next step
                # ego state in next step
                state = observations[AGENT_ID]
                if WITH_SOC_MT:
                    reward = rewards[AGENT_ID]
                else:
                    # rewards is a dict keyed by agent id; sum its values
                    reward = np.sum(list(rewards.values()))
                done = dones[AGENT_ID]
                info = infos[AGENT_ID]
                aux_info = get_aux_info(infos[AGENT_ID]["env_obs"])
                episode.record_step(observations, rewards, dones, infos)
                if WITH_SOC_MT:
                    episode_reward += np.sum(reward)
                else:
                    episode_reward += reward
                env_steps += 1

                if done:
                    test_ep += 1
                    # record result
                    if aux_info == "collision":
                        collision += 1
                        failure += 1
                    elif aux_info == "trouble_collision":
                        trouble_collision += 1
                        passed_case += 1
                    elif aux_info == "time_exceed":
                        time_exceed += 1
                        failure += 1
                    else:
                        # get episode time
                        episode_time_record.append(env_steps * 0.1)
                        success += 1
                    # print
                    print(
                        episode.index,
                        "EPISODE ended",
                        "TOTAL REWARD {:.4f}".format(episode_reward),
                        "Result:",
                        aux_info,
                    )
                    print("total steps of this episode: ", env_steps)
                    episode_reward = 0
                    env_steps = 0
                    observations = env.reset()  # states of all vehs
                    state = observations[AGENT_ID]  # ego state

    env.close()
    print("-*" * 15, " result ", "-*" * 15)
    print("success: ", success, "/", test_num)
    print("collision: ", collision, "/", test_num)
    print("time_exceed: ", time_exceed, "/", test_num)
    print("passed_case: ", passed_case, "/", test_num)
    print("average time: ", np.mean(episode_time_record))