class DDPG(object):
    """Deep deterministic policy gradient."""

    def __init__(self, n_state, n_action, a_bound, gamma=0.99, tau=0.01,
                 actor_lr=0.0005, critic_lr=0.001, noise_std=0.1,
                 noise_decay=0.9995, noise_decay_steps=1000, buffer_size=20000,
                 save_interval=5000, assess_interval=10, logger=None,
                 checkpoint_queen=None):
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.pointer = 0
        self.buffer_size = buffer_size
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state, self.n_action, gamma=gamma,
                           lr=actor_lr, tau=tau, l2_reg=0)
        self.critic = Critic(self.n_state, self.n_action, gamma=gamma,
                             lr=critic_lr, tau=tau, l2_reg=0)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__.lower()

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def policy_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """Use the Bellman equation to compute the critic target."""
        q_target = np.zeros_like(rewards)  # np.asarray(copy=False) vs np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.critic.gamma * q_nexts[i]
        return q_target

    def update_model(self, states, actions, q_values):
        # train critic
        loss_names, loss_values = self.critic.train_on_batch(states, actions, q_values)
        # train actor on dQ/da evaluated at the (noise-free) policy actions
        grad_ys = self.critic.gradients(states, self.actor.predict(states))  # (batch, n_action)
        actor_output = self.actor.train(states, self.actor.predict(states), grad_ys)
        # update target networks
        self.actor.copy_weights()
        self.critic.copy_weights()
        return loss_names, loss_values, grad_ys, actor_output

    def save_weights(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic.model.save(
            os.path.join(path, self.prefix + '_critic_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4f}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            actor = os.path.join(path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            critic = os.path.join(path, self.prefix + '_critic_' + to_delete[1] + '.h5')
            os.remove(actor)
            os.remove(critic)
        if need_save:
            self.save_model(path, signature)

    def train(self, args, summary_writer, train_data=None, val_data=None, test_data=None):
        results = []
        max_val_rate = 0
        val_rate, top_k = 0., 0  # defined up front so the tqdm postfix never sees unbound names
        # np.asarray(None) would give array(None), which is not None, so guard first
        val_data = np.asarray(val_data) if val_data is not None else None

        # First, gather experience
        tqdm_e = tqdm(range(args.batchs), desc='score', leave=True, unit=" epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir, args.reg_pattern, chunksize=args.batch_size)  # 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
            # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        else:
            dataset = Dataset(train_data, args.batch_size, shuffle=True)

        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)
            a = self.policy_action(states)  # (batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            # a = np.clip(np.random.normal(a, self.noise_std), self.a_bound[0], self.a_bound[1])
            # a = np.clip(a + noise.generate(time, a.shape[0]), self.a_bound[0], self.a_bound[1])

            # clipped log-odds of the predicted probability
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)
            r = np.where(labels == 1, llr.ravel(), -llr.ravel())  # (batch,)
            # q_nexts = self.critic.target_predict(new_states, self.actor.target_predict(new_states))
            q_ = self.bellman_q_value(rewards=r, q_nexts=0, dones=[True] * r.shape[0])  # (batch,)
            loss_names, loss_values, grad_ys, actor_output = self.update_model(
                states, a, q_.reshape(-1, 1))
            score = r.mean()

            if (e + 1) % self.noise_decay_steps == 0:
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)

            if e % self.assess_interval == 0 or e == args.batchs - 1:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate
                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            summary_writer.add_summary(tf_summary(['mean-reward'], [score]), global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]), global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic.model.input[0]: states,
                    self.critic.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            self.logger.log_tabular('dQ/da', '%.4f+%.4f' % (grad_ys.mean(), grad_ys.std()))  # grad_ys (batch, act_dim)
            self.logger.log_tabular('aout', '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward', '%.4f+%.4f' % (score, r.std()))
            self.logger.dump_tabular()
            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4f}'.format(self.noise_std),
                               max_val_rate='{:.4f}'.format(max_val_rate),
                               val_rate='{:.4f}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()
        return results
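# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the target computation that
# bellman_q_value above performs with a Python loop, vectorized with numpy.
# This is illustrative only (the names below are not part of the class); with
# q_nexts=0 and dones all True, as in train(), it reduces to the raw rewards,
# matching how this DDPG variant is used as a one-step (bandit-style) task.
import numpy as np

def bellman_targets(rewards, q_nexts, dones, gamma=0.99):
    """Vectorized Bellman targets: r for terminal steps, r + gamma*Q' otherwise."""
    rewards = np.asarray(rewards, dtype=float)
    q_nexts = np.broadcast_to(np.asarray(q_nexts, dtype=float), rewards.shape)
    dones = np.asarray(dones, dtype=bool)
    return np.where(dones, rewards, rewards + gamma * q_nexts)

# Example: one terminal and one bootstrapped transition.
# bellman_targets([1.0, 1.0], [0.5, 0.5], [True, False]) -> array([1.0, 1.495])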
def run_carla_client(args):
    # Run a long training session: 60000 episodes of up to 400 frames each.
    number_of_episodes = 60000
    frames_per_episode = 400

    # We assume the CARLA server is already waiting for a client to connect at
    # host:port. To create a connection we can use the `make_carla_client`
    # context manager, it creates a CARLA client object and starts the
    # connection. It will throw an exception if something goes wrong. The
    # context manager makes sure the connection is always cleaned up on exit.
    with make_carla_client(args.host, args.port, 30) as client:
        print('CarlaClient connected')

        # =====================================================================
        # Global initialisations
        # =====================================================================
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        K.set_session(sess)

        state_size = {'state_2D': (64, 64, 9), 'state_1D': (17, )}
        action_size = (5, )

        critic = Critic(sess, state_size, action_size, CRITIC_LR)
        critic.target_train()
        actor = Actor(sess, state_size, action_size, ACTOR_LR)
        actor.target_train()

        memory = ExperienceMemory(100000, False)

        target_update_counter = 0
        target_update_freq = TARGET_UPDATE_BASE_FREQ

        explore_rate = 0.2
        success_counter = 0

        total_t = 0
        t = 0
        # NOTE: this is only a trial target; it still needs to be reworked.
        target = {
            'pos': np.array([-3.7, 236.4, 0.9]),
            'ori': np.array([0.00, -1.00, 0.00])
        }

        if args.settings_filepath is None:
            # Create a CarlaSettings object. This object is a wrapper around
            # the CarlaSettings.ini file. Here we set the configuration we
            # want for the new episode.
            settings = CarlaSettings()
            settings.set(SynchronousMode=True,
                         SendNonPlayerAgentsInfo=True,
                         NumberOfVehicles=0,
                         NumberOfPedestrians=0,
                         WeatherId=random.choice([1]),
                         QualityLevel=args.quality_level)
            # settings.randomize_seeds()

            # The default camera captures RGB images of the scene.
            camera0 = Camera('CameraRGB')
            # Set image resolution in pixels.
            camera0.set_image_size(64, 64)
            # Set its position relative to the car in centimeters.
            camera0.set_position(0.30, 0, 1.30)
            settings.add_sensor(camera0)
        else:
            # Alternatively, we can load these settings from a file.
            with open(args.settings_filepath, 'r') as fp:
                settings = fp.read()

        scene = client.load_settings(settings)

        # =====================================================================
        # EPISODES LOOP
        # =====================================================================
        for episode in range(0, number_of_episodes):
            # Start a new episode.
            # Choose one player start at random.
            number_of_player_starts = len(scene.player_start_spots)
            player_start = random.randint(0, max(0, number_of_player_starts - 1))
            player_start = 0  # override: always start from spot 0
            total_reward = 0.

            # Notify the server that we want to start the episode at the
            # player_start index. This function blocks until the server is
            # ready to start the episode.
            print('Starting new episode...')
            client.start_episode(player_start)

            # TODO: the learning algorithm should be implemented inside this block.
            # =================================================================
            # Episodic initialisations
            # =================================================================
            collisions = {'car': 0, 'ped': 0, 'other': 0}
            reverse = -1.0
            measurements, sensor_data = client.read_data()
            state = get_state_from_data(measurements, sensor_data, reverse)
            goal = get_goal_from_data(target)
            t = 0
            stand_still_counter = 0

            # =================================================================
            # STEPS LOOP
            # =================================================================
            for frame in range(0, frames_per_episode):
                t = t + 1
                total_t += 1
                target_update_counter += 1
                explore_dev = 0.6 / (1 + total_t / 30000)
                explore_rate = 0.3 / (1 + total_t / 30000)

                # Print some of the measurements.
                # print_measurements(measurements)

                # Save the images to disk if requested (currently disabled).
                if args.save_images_to_disk and False:
                    for name, measurement in sensor_data.items():
                        filename = args.out_filename_format.format(episode, name, frame)
                        measurement.save_to_disk(filename)

                if state['state_1D'][9] < 5 and t > 50:
                    stand_still_counter += 1
                else:
                    stand_still_counter = 0

                # Calculate the action
                a_pred = actor.model.predict([
                    np.expand_dims(state['state_2D'], 0),
                    np.expand_dims(np.concatenate((state['state_1D'], goal)), 0)
                ])[0]
                # Add exploration noise to the action
                a = add_noise(a_pred, explore_dev, explore_rate)
                control = get_control_from_a(a)
                # Send control to the server
                client.send_control(control)

                # =============================================================
                # TRAINING THE NETWORKS
                # =============================================================
                if memory.num_items > 6000:
                    batch, indices = memory.sample_experience(MINI_BATCH_SIZE)

                    raw_states = [[e[0]['state_2D'], e[0]['state_1D']] for e in batch]
                    goals = np.asarray([e[5] for e in batch])
                    states = {
                        'state_2D':
                        np.atleast_2d(np.asarray([e[0] for e in raw_states[:]])),
                        'state_1D':
                        np.atleast_2d(
                            np.asarray([
                                np.concatenate([e[1], goals[i]], axis=-1)
                                for i, e in enumerate(raw_states[:])
                            ]))
                    }
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([np.sum(e[2]) for e in batch]).reshape(-1, 1)
                    raw_new_states = [[e[3]['state_2D'], e[3]['state_1D']] for e in batch]
                    new_states = {
                        'state_2D':
                        np.atleast_2d(np.asarray([e[0] for e in raw_new_states[:]])),
                        'state_1D':
                        np.atleast_2d(
                            np.asarray([
                                np.concatenate([e[1], goals[i]], axis=-1)
                                for i, e in enumerate(raw_new_states[:])
                            ]))
                    }
                    overs = np.asarray([e[4] for e in batch]).reshape(-1, 1)

                    best_a_preds = actor.target_model.predict(
                        [new_states['state_2D'], new_states['state_1D']])
                    max_qs = critic.target_model.predict([
                        new_states['state_2D'], new_states['state_1D'], best_a_preds
                    ])
                    ys = rewards + (1 - overs) * GAMMA * max_qs

                    # Train Critic network
                    critic.model.train_on_batch(
                        [states['state_2D'], states['state_1D'], actions], ys)

                    # Train Actor network
                    a_for_grads = actor.model.predict(
                        [states['state_2D'], states['state_1D']])
                    a_grads = critic.gradients(states, a_for_grads)
                    actor.train(states, a_grads)

                    # Train target networks
                    if target_update_counter >= int(target_update_freq):
                        target_update_counter = 0
                        target_update_freq = target_update_freq * TARGET_UPDATE_MULTIPLIER
                        critic.target_train()
                        actor.target_train()

                # =============================================================
                # GET AND STORE OBSERVATIONS
                # =============================================================
                # Get next measurements
                measurements, sensor_data = client.read_data()
                new_state = get_state_from_data(measurements, sensor_data, reverse, state)

                # TODO: calculate reward
                r_goal, success = calculate_goal_reward(
                    np.atleast_2d(new_state['state_1D']), goal)
                r_general, collisions = calculate_general_reward(measurements, collisions)
                over = stand_still_counter > 30 or success
                success_counter += int(bool(success))
                total_reward += r_goal
                total_reward += r_general

                # Store observation
                if t > 10:
                    experience = pd.DataFrame(
                        [[state, a, np.array([r_goal, r_general]), new_state,
                          bool(over), goal, episode, 0]],
                        columns=['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p'],
                        copy=True)
                    memory.add_experience(experience)

                # Set the state to the next state
                state = new_state
                if over:
                    break

            sub_goal = deepcopy(state['state_1D'][0:6])
            print(str(episode) + ". Episode###################")
            print("Total reward: " + str(total_reward))
            print("Success counter: " + str(success_counter))
            if episode % 10 == 0:
                print("############## DEBUG LOG ################")
                print("Memory state: " + str(memory.num_items))
                print("Target update counter: " + str(target_update_counter))
                print("Exploration rate: " + str(explore_rate))
                print("Exploration dev: " + str(explore_dev))
                print("Total timesteps: " + str(total_t))
                print("Average episode length: " + str(total_t / (episode + 1)))
                print("#########################################")

            # =================================================================
            # REPLAY FOR SUBGOALS
            # =================================================================
            batch = memory.get_last_episode(t)
            raw_new_states = [[e[3]['state_2D'], e[3]['state_1D']] for e in batch]
            new_states = {
                'state_2D':
                np.atleast_2d(np.asarray([e[0] for e in raw_new_states[:]])),
                'state_1D':
                np.atleast_2d(np.asarray([e[1] for e in raw_new_states[:]]))
            }
            rewards = np.asarray([e[2] for e in batch]).reshape(-1, 2)
            r_subgoal = calculate_goal_reward(new_states['state_1D'], sub_goal)[0]
            rewards[:, 0] = r_subgoal
            subgoal_batch = [[v[0], v[1], rewards[i], v[3], v[4], sub_goal,
                              v[6], v[7]] for i, v in enumerate(batch)]
            experiences = pd.DataFrame(
                subgoal_batch,
                columns=['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p'],
                copy=True)
            memory.add_experience(experiences)
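# ---------------------------------------------------------------------------
# The "REPLAY FOR SUBGOALS" block above relabels the last episode with a
# subgoal taken from the final state and recomputes the goal-dependent part
# of the reward. A hedged, generic sketch of that hindsight-relabeling idea;
# `compute_reward` is a placeholder for a function such as
# calculate_goal_reward above, not an import from this project.
def relabel_with_subgoal(episode, subgoal, compute_reward):
    """episode: iterable of (state, action, reward, next_state, done, goal)."""
    relabeled = []
    for s, a, _r, s2, done, _goal in episode:
        r_new = compute_reward(s2, subgoal)  # reward w.r.t. the achieved subgoal
        relabeled.append((s, a, r_new, s2, done, subgoal))
    return relabeled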
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class """

    def __init__(self, act_dim, env_dim, act_range, buffer_size=20000,
                 gamma=0.99, lr=0.00005, tau=0.001):
        """ Initialization """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = env_dim
        self.gamma = gamma
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.demo_actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = Replay()
        self.batch_size = 2000

    def policy_action(self, s):
        """ Use the actor to predict value """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer """
        self.buffer.append((state, action, reward, done, new_state))

    def sample_batch(self):
        return self.buffer.sample_batch(self.batch_size)

    def update_models(self, states, actions, critic_target, actor_res, demo_actor_res):
        """ Update actor and critic networks from sampled experience """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target, actor_res, demo_actor_res)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        demo_actions = self.demo_actor.model.predict(states)
        demo_grads = self.critic.gradients(states, demo_actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.act_dim)))
        self.demo_actor.train(states, demo_actions,
                              np.array(demo_grads).reshape((-1, self.act_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.demo_actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, env):
        results = []
        # First, gather experience
        tqdm_e = tqdm(range(50000), desc='Score', leave=True, unit=" episodes")
        success = []
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)
            blockPos, blockOrn = p.getBasePositionAndOrientation(env.blockUid)
            experience = []
            while not done:
                # Actor picks an action (following the deterministic policy)
                old_state = get_state(env)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                gripperState = p.getLinkState(env._kuka.kukaUid, env._kuka.kukaGripperIndex)
                gripperPos = gripperState[0]
                gripperOrn = gripperState[1]
                a = np.clip(a + noise.generate(time), -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                new_state = get_state(env)
                gripperState = p.getLinkState(env._kuka.kukaUid, env._kuka.kukaGripperIndex)
                next_gripperPos = gripperState[0]
                next_gripperOrn = gripperState[1]
                # Add outputs to memory buffer
                experience.append((old_state, a, r, done, new_state, gripperPos,
                                   gripperOrn, next_gripperPos, next_gripperOrn,
                                   blockPos, blockOrn))
                # self.memorize(old_state, a, r, done, new_state)

                # HER replay, sample a new goal
                blockPos, blockOrn = gripperPos, gripperOrn
                step_size = len(experience)
                her_experience = []
                for t in range(step_size):
                    (old_state, action, reward, done, next_state, gripperPos,
                     gripperOrn, next_gripperPos, next_gripperOrn, _, _) = np.copy(experience[t])
                    blockInGripperPosXYEulZ = env.get_block_in_gripper_pos(
                        gripperPos, gripperOrn, blockPos, blockOrn)
                    old_state[6:9] = blockInGripperPosXYEulZ
                    next_blockInGripperPosXYEulZ = env.get_block_in_gripper_pos(
                        next_gripperPos, next_gripperOrn, blockPos, blockOrn)
                    next_state[6:9] = next_blockInGripperPosXYEulZ
                    if t == step_size - 1:
                        reward = 0.5
                    her_experience.append((old_state, action, reward, done, next_state,
                                           gripperPos, gripperOrn, next_gripperPos,
                                           next_gripperOrn, blockPos, blockOrn))

                self.train_batch()
                # Update current state
                old_state = new_state
                if r > 0:
                    print('r', r)
                    success.append(e)
                    print(success)
                cumul_reward += r
                time += 1

            self.buffer.memo.extend(experience)
            self.buffer.demo_memo.extend(her_experience)
            # Gather stats every episode for plotting
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()
        return results

    def train_batch(self):
        if len(self.buffer.memo) > self.batch_size and len(self.buffer.demo_memo) > self.batch_size:
            # Sample experience from buffer
            sample_batch, sample_demo_batch = self.sample_batch()
            states, actions, rewards, dones, new_states = [], [], [], [], []
            samples_size = len(sample_batch)
            for state, action, reward, done, new_state, _, _, _, _, _, _ in sample_batch:
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                dones.append(done)
                new_states.append(new_state)
            new_states = np.reshape(np.array(new_states), (samples_size, -1))
            actor_res = self.actor.target_predict(new_states)
            demo_actor_res = self.demo_actor.target_predict(new_states)
            q_values = self.critic.target_predict([new_states, actor_res])[0]
            q_values = np.reshape(q_values, (samples_size, ))
            critic_targets = self.bellman(rewards, q_values, dones)
            states = np.array(states)
            actions = np.array(actions)
            self.update_models(states, actions, critic_targets, actor_res, demo_actor_res)
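# ---------------------------------------------------------------------------
# The training loop above perturbs actions with noise.generate(time) from an
# OrnsteinUhlenbeckProcess. A hedged sketch of such a process; the parameter
# names (theta, mu, sigma) are the conventional ones and the project's own
# class may differ in details.
import numpy as np

class OrnsteinUhlenbeckSketch:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2):
        self.size, self.theta, self.mu, self.sigma = size, theta, mu, sigma
        self.x = np.ones(size) * mu

    def generate(self, _t=None):
        # Mean-reverting step: drift back toward mu plus Gaussian noise,
        # which yields temporally correlated exploration noise.
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x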
class DDPG(object):
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class """

    def __init__(self, action_dim, state_dim, batch_size, step, buffer_size,
                 train_indicator, episode, gamma, lra, lrc, tau, load_weight=True):
        """ Initialization """
        # Environment and A2C parameters
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.step = step
        self.gamma = gamma
        self.lra = lra
        self.lrc = lrc
        self.tau = tau
        self.episode = episode
        self.train_indicator = train_indicator
        # Create actor and critic networks
        self.actor = Actor(state_dim, action_dim, batch_size, lra, tau)
        self.critic = Critic(state_dim, action_dim, batch_size, lrc, tau)
        self.buffer = MemoryBuffer(buffer_size)

        # NOTE: the weights folder must be specified, and it should contain
        # exactly one set of actor/critic weights.
        self.weights_dir_path = os.getcwd() + r"\saved_model\*.h5"
        if load_weight:
            try:
                weights_actor_path = ""
                weights_critic_path = ""
                weights_file_path = glob.glob(self.weights_dir_path)
                for file_path in weights_file_path:
                    if "actor" in file_path:
                        weights_actor_path = file_path
                    elif "critic" in file_path:
                        weights_critic_path = file_path
                self.load_weights(weights_actor_path, weights_critic_path)
                print("")
                print("Actor-Critic Models are loaded with weights...")
                print("")
            except Exception:
                print("")
                print("Failed to load weights, please check the weights loading path...")
                print("")

    def policy_action(self, s):
        """ Use the actor to predict value """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target (one action only) """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state_old, action, reward, done, state_new):
        """ Store experience in memory buffer """
        self.buffer.memorize(state_old, action, reward, done, state_new)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.action_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def run(self, env):
        # First, gather experience
        for e in range(self.episode):
            # Reset episode: set the initial state
            loss, cumul_reward, cumul_loss = 0, 0, 0
            done = False
            state_old = env.get_vissim_state(
                1, 180 * 5,
                [45, 55, 60, 65, 70, 75, 80])  # TODO: make sure states are received correctly
            actions, states, rewards = [], [], []
            print("Episode: ", e, " ========================:")

            for t in range(self.step):
                action_original = self.policy_action(state_old)
                # TODO: OU function params?
                noise = OrnsteinUhlenbeckProcess(x0=action_original, size=self.action_dim)
                # action = action_orig + noise
                action = noise.apply_ou(t)
                # clip too-low or too-high actions into the valid range
                adj_action = np.clip(action, -1, 1)
                # action-mapping function
                transformed_action = Transformation.convert_actions(adj_action)
                reward, state_new = env.get_vissim_reward(180 * 5, transformed_action)
                # TODO: if we knew the optimal discharging rate, we would use it
                # as `done`; for now the manually set last step counts as done.
                if t == self.step - 1:
                    done = True

                # ============================================== training section
                if self.train_indicator:
                    # Add outputs to memory buffer
                    self.memorize(state_old, adj_action, reward, done, state_new)
                    # Sample experience from buffer
                    states_old, actions, rewards, dones, states_new = self.sample_batch(self.batch_size)
                    # Predict target q-values using target networks
                    q_values = self.critic.target_predict(
                        [states_new, self.actor.target_predict(states_new)])
                    # Compute critic target
                    critic_target = self.bellman(rewards, q_values, dones)
                    # Train both networks on sampled batch, update target networks
                    self.update_models(states_old, actions, critic_target)
                    # Calculate loss (NOTE: this runs a second critic update on
                    # the same batch; it is kept only to obtain a loss value).
                    loss = self.critic.train_on_batch(states_old, actions, critic_target)

                state_old = state_new
                cumul_reward += reward
                cumul_loss += loss
                # ============================================== report
                print("|---> Step: ", t, " | Action: ", transformed_action,
                      " | Reward: ", reward, " | Loss: ", loss)

            # ============================================== save model
            if np.mod(e, 10) == 0:
                print("====================> Saving model...")
                self.save_weights("./saved_model/")
                """
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
                """

            print("")
            print("*-------------------------------------------------*")
            print("Average Accumulated Reward: " + str(cumul_reward / self.step))
            print("Average Accumulated Loss: " + str(cumul_loss / self.step))
            print("*-------------------------------------------------*")
            print("")
            # garbage recycling
            gc.collect()

    def save_weights(self, path):
        t = datetime.datetime.now()
        time = "_" + str(t.date()) + "_" + str(t.hour) + "h-" + str(t.minute) + "m"
        path_actor = path + '_LR_{}'.format(self.lra) + time
        path_critic = path + '_LR_{}'.format(self.lrc) + time
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load(path_actor)
        self.critic.load(path_critic)
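# ---------------------------------------------------------------------------
# transfer_weights above moves the online weights into the target networks at
# rate tau (Polyak averaging), the standard DDPG soft update. A hedged
# Keras-style sketch of what such an update typically does; `model` and
# `target_model` are assumed to share the same architecture.
def soft_update(model, target_model, tau):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    # target <- tau * online + (1 - tau) * target, layer by layer
    mixed = [tau * w + (1.0 - tau) * tw for w, tw in zip(weights, target_weights)]
    target_model.set_weights(mixed)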
class TD3(object):
    """Twin Delayed Deep Deterministic policy gradient (TD3)."""

    def __init__(self, n_state, n_action, a_bound, discount=0.99, tau=0.05,
                 actor_lr=0.001, critic_lr=0.001, policy_freq=2,
                 exp_noise_std=0.1, noise_decay=0.9995, noise_decay_steps=1000,
                 smooth_noise_std=0.1, clip=0.2, buffer_size=20000,
                 save_interval=5000, assess_interval=20, logger=None,
                 checkpoint_queen=None):
        # self.__dict__.update(locals())
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = exp_noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.policy_freq = policy_freq
        self.smooth_noise_std = smooth_noise_std
        self.clip = clip
        self.discount = discount
        self.pointer = 0
        self.buffer = MemoryBuffer(buffer_size, with_per=True)
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state, self.n_action, gamma=discount, lr=actor_lr, tau=tau)
        self.critic1 = Critic(self.n_state, self.n_action, gamma=discount, lr=critic_lr, tau=tau)
        self.critic2 = Critic(self.n_state, self.n_action, gamma=discount, lr=critic_lr, tau=tau)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic1.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1', self.critic1.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1', self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2', self.critic1.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2', self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def select_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """Use the Bellman equation to compute the critic target."""
        q_target = np.zeros_like(rewards)  # np.asarray(copy=False) vs np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.discount * q_nexts[i]
        return q_target

    def memorize(self, state, action, reward, done, new_state):
        """Store experience in memory buffer."""
        if self.buffer.with_per:
            q_val = reward
            q_val_t = self.critic1.target_predict(state, action)
            td_error = abs(q_val_t - q_val)[0]
        else:
            td_error = 0
        state = state.reshape(-1)
        action = action.reshape(-1)
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_actor(self, states):
        actions = self.actor.predict(states)
        grad_ys = self.critic1.gradients(states, actions)
        actor_output = self.actor.train(states, actions, grad_ys)
        self.actor.copy_weights()
        self.critic1.copy_weights()
        self.critic2.copy_weights()
        return grad_ys, actor_output

    def update_critic(self, states, actions, q_values):
        loss_names, loss_values = self.critic1.train_on_batch(states, actions, q_values)
        self.critic2.train_on_batch(states, actions, q_values)
        return loss_names, loss_values

    def save_weights(self, path):
        self.actor.save(path)
        self.critic1.save(path)
        self.critic2.save(path)

    def save_model(self, path, file):
        self.actor.model.save(os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic1.model.save(os.path.join(path, self.prefix + '_critic1_' + file + '.h5'))
        self.critic2.model.save(os.path.join(path, self.prefix + '_critic2_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4f}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            delete_actor = os.path.join(path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            delete_critic1 = os.path.join(path, self.prefix + '_critic1_' + to_delete[1] + '.h5')
            delete_critic2 = os.path.join(path, self.prefix + '_critic2_' + to_delete[1] + '.h5')
            os.remove(delete_actor)
            os.remove(delete_critic1)
            os.remove(delete_critic2)
        if need_save:
            self.save_model(path, signature)

    def train(self, args, summary_writer, train_data=None, val_data=None, test_data=None):
        results = []
        max_val_rate = 0
        val_rate, top_k = 0., 0  # defined up front for the tqdm postfix
        val_data = np.asarray(val_data) if val_data is not None else None

        # First, gather experience
        tqdm_e = tqdm(range(args.batchs), desc='score', leave=True, unit=" epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir, args.reg_pattern, chunksize=args.batch_size)  # 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
            # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        else:
            dataset = Dataset(train_data, 1, shuffle=True)
        warm_up = 20 * args.batch_size

        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)
            a = self.select_action(states)  # (batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5).ravel()
            # rewards = np.where(labels == 1, llr, -llr)  # (batch,)
            rewards = np.where(labels == 1,
                               np.where(llr > 0, llr, 2 * llr),
                               np.where(llr < 0, -llr, -2 * llr))  # (batch,)
            # The full TD3 target would smooth the target action and bootstrap
            # from the minimum of both target critics:
            # a_ = self.actor.target_predict(next_states)
            # noise = np.clip(np.random.normal(0, self.smooth_noise_std), -self.clip, self.clip)
            # a_ = a_ + noise
            # q_next1 = self.critic1.target_predict(new_states, a_)
            # q_next2 = self.critic2.target_predict(new_states, a_)
            # q_nexts = np.where(q_next1 < q_next2, q_next1, q_next2)
            self.memorize(states, a, rewards, True, None)
            if e < warm_up:
                continue

            states, a, rewards, _, _, _ = self.sample_batch(args.batch_size)
            q_ = self.bellman_q_value(rewards=rewards, q_nexts=0,
                                      dones=[True] * rewards.shape[0])  # (batch,)
            loss_names, loss_values = self.update_critic(states, a, q_.reshape(-1, 1))
            if e % self.policy_freq == 0 or e == warm_up:
                grad_ys, actor_output = self.update_actor(states)

            if (e + 1) % self.noise_decay_steps == 0 or e == warm_up:
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)

            if e % self.assess_interval == 0 or e == args.batchs - 1 or e == warm_up:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate
                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            score = rewards.mean()
            summary_writer.add_summary(tf_summary(['mean-reward'], [score]), global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]), global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic1.model.input[0]: states,
                    self.critic1.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            self.logger.log_tabular('dQ/da', '%.4f+%.4f' % (grad_ys.mean(), grad_ys.std()))  # grad_ys (batch, act_dim)
            self.logger.log_tabular('aout', '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward', '%.4f+%.4f' % (score, rewards.std()))
            self.logger.dump_tabular()
            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4f}'.format(self.noise_std),
                               max_val_rate='{:.4f}'.format(max_val_rate),
                               val_rate='{:.4f}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()
        return results
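# ---------------------------------------------------------------------------
# The commented-out block in TD3.train() sketches the full TD3 target: smooth
# the target action with clipped noise, then bootstrap from the minimum of
# the two target critics. A hedged numpy outline of that computation; the
# target_predict calls are assumed to return (batch, 1) arrays like the
# Critic class above.
import numpy as np

def td3_target(actor, critic1, critic2, next_states, rewards, dones,
               discount=0.99, smooth_noise_std=0.1, clip=0.2):
    a_next = actor.target_predict(next_states)
    # target policy smoothing: symmetric, clipped Gaussian noise
    noise = np.clip(np.random.normal(0, smooth_noise_std, size=a_next.shape), -clip, clip)
    a_next = a_next + noise
    q1 = critic1.target_predict(next_states, a_next)
    q2 = critic2.target_predict(next_states, a_next)
    q_next = np.minimum(q1, q2)  # clipped double-Q
    dones = np.asarray(dones, dtype=float).reshape(-1, 1)
    return np.asarray(rewards).reshape(-1, 1) + (1.0 - dones) * discount * q_next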
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target Network HyperParameters
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # number of sensor inputs

    # np.random.seed(1337)
    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU optimization
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = Actor(sess, BATCH_SIZE, TAU, LRA)
    critic = Critic(sess, BATCH_SIZE, TAU, LRC)
    sess.run(tf.global_variables_initializer())

    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        total_reward = 0.

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # initialise the targets from the rewards so y_t has shape (batch,)
            y_t = np.asarray([e[2] for e in batch])

            target_q_values = critic.target_predict(
                [new_states, actor.target_predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.train([states, actions], y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(step, 1000) == 0:
            print("target_q_values : ", target_q_values)

        if np.mod(i, 3) == 0:
            if train_indicator:
                pass
                # saver = tf.train.Saver()
                # saver.save(sess, save_path='weights/model.ckpt', global_step=1000)
                # print("Now we save model")
                # actor.model.save_weights("actormodel.h5", overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)
                # critic.model.save_weights("criticmodel.h5", overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
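# ---------------------------------------------------------------------------
# OU.function above draws per-dimension Ornstein-Uhlenbeck noise that drifts
# each action toward a chosen mean (e.g. 0.5 for throttle, -0.1 for brake).
# A hedged sketch of the conventional form, matching the (x, mu, theta, sigma)
# call order used in playGame; named OUFunctionSketch so it does not shadow
# the project's own OU class.
import numpy as np

class OUFunctionSketch(object):
    @staticmethod
    def function(x, mu, theta, sigma):
        # Drift back toward mu, plus Gaussian exploration noise.
        return theta * (mu - x) + sigma * np.random.randn(1)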
class DDPG():
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class """

    def __init__(self, act_dim, env_dim, act_range, k, buffer_size=10000,
                 gamma=0.99, lr=0.001, tau=0.001):
        """ Initialization """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        # NOTE: the env_dim argument is ignored; the state shape is hard-coded
        # to (1, 13) = 10 observation values + 3 goal values.
        self.env_dim = (1, ) + (13, )
        self.gamma = gamma
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        # self.buffer = MemoryBuffer(buffer_size)
        self.buffer = deque(maxlen=buffer_size)
        self.count = 0
        self.buffer_size = buffer_size

    def policy_action(self, s):
        """ Use the actor to predict value """
        return self.actor.predict(s)

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer """
        experience = (state, action, reward, done, new_state)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)
        s_batch, a_batch, r_batch, d_batch, s2_batch = [], [], [], [], []
        for s_, a_, r_, d_, s2_ in batch:
            s_batch.append(s_)
            s2_batch.append(s2_)
            a_batch.append(a_)
            r_batch.append(r_)
            d_batch.append(d_)
        s_batch = np.squeeze(np.array(s_batch), axis=1)
        s2_batch = np.squeeze(np.array(s2_batch), axis=1)
        r_batch = np.reshape(np.array(r_batch), (len(r_batch), 1))
        a_batch = np.array(a_batch)
        d_batch = np.reshape(np.array(d_batch, dtype=int), (len(batch), 1))
        return s_batch, a_batch, r_batch, d_batch, s2_batch

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model(states)
        grads = self.critic.gradients(states, tfe.Variable(actions))
        # Train actor
        self.actor.train(states, grads)
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def format_state(self, state):
        ob_1 = np.reshape(state['observation'], (1, 10))
        de_1 = np.reshape(state['desired_goal'], (1, 3))
        return np.concatenate([ob_1, de_1], axis=1)

    def store_states(self, state, action, reward, done, info, new_state):
        ob_1 = np.reshape(state['observation'], (1, 10))
        ac_1 = np.reshape(state['achieved_goal'], (1, 3))
        de_1 = np.reshape(state['desired_goal'], (1, 3))
        ob_2 = np.reshape(new_state['observation'], (1, 10))
        s_1 = np.concatenate([ob_1, ac_1], axis=1)
        s2_1 = np.concatenate([ob_2, ac_1], axis=1)
        s_2 = np.concatenate([ob_1, de_1], axis=1)
        s2_2 = np.concatenate([ob_2, de_1], axis=1)
        # Hindsight substitution: treat the achieved goal as if it had been
        # desired. NOTE: this relies on a global `env`; passing the
        # environment in explicitly would be cleaner.
        substitute_goal = state['achieved_goal'].copy()
        substitute_reward = env.compute_reward(state['achieved_goal'],
                                               substitute_goal, info)
        e1 = (s_2, action, reward, done, s2_2)
        e2 = (s_1, action, substitute_reward, True, s2_1)
        if self.count + 2 < self.buffer_size:
            self.count += 2
        else:
            self.buffer.popleft()
            self.buffer.popleft()
        self.buffer.append(e1)
        self.buffer.append(e2)

    def train(self, env, args):
        results = []
        num_steps = 200
        # First, gather experience
        tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit="episode")
        avg_r_ep = 0
        best_avg = -float('inf')
        best_score = -float('inf')
        past_samples = 15
        hist_ratio = deque(maxlen=past_samples)
        hist_scores = deque(maxlen=past_samples)
        for e in tqdm_e:
            noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.act_dim))
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            s = env.reset()
            for _ in range(num_steps):
                if args.render:
                    env.render()
                # Actor picks an action (following the deterministic policy)
                old_state = self.format_state(s)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a + noise(), -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                a = np.squeeze(a)
                new_state, r, done, info = env.step(a)
                dist = goal_distance(new_state['achieved_goal'], new_state['desired_goal'])
                # Add outputs to memory buffer
                self.store_states(s, a, r, done, info, new_state)
                s = new_state
                cumul_reward += r
                # Sample experience from buffer
                states, actions, rewards, dones, new_states = self.sample_batch(args.batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states, self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                # Update current state
                if done:
                    break

            if avg_r_ep == 0:
                avg_r_ep = cumul_reward
            else:
                avg_r_ep = avg_r_ep * 0.99 + cumul_reward * 0.01
            if avg_r_ep >= best_avg:
                best_avg = avg_r_ep
                self.actor.model.save_weights('pretrained/best_avg_ddpgActor.h5')
                self.critic.model.save_weights('pretrained/best_avg_ddpgCritic.h5')
            # Display score
            if cumul_reward >= best_score:
                best_score = cumul_reward
                self.actor.model.save_weights('pretrained/ddpgActor.h5')
                self.critic.model.save_weights('pretrained/ddpgCritic.h5')

            hist_ratio.append(int(dist <= 0.05))
            hist_scores.append(cumul_reward)
            tqdm_e.set_description(
                "Score: {} | "
                "Best Reward: {} (avg: {:.2f})| "
                "Avg Reward, solve ratio over last {} samples: {:.3f}, {:.3f}".format(
                    cumul_reward, np.amax(hist_scores), avg_r_ep,
                    past_samples, np.mean(hist_scores), np.mean(hist_ratio)))
            tqdm_e.refresh()
        return results

    def eval(self, env, model_name='', random=False, render=False):
        if not random:
            self.actor.model.load_weights('pretrained/' + model_name + 'Actor.h5')
            self.critic.model.load_weights('pretrained/' + model_name + 'Critic.h5')
        score = 0
        solve_count = 0
        tr = tqdm(range(100))
        avg_time = 0
        for ep in tr:
            state = env.reset()
            for t in range(50):
                if render:
                    env.render()
                if random:
                    a = env.action_space.sample()
                else:
                    a = self.policy_action(self.format_state(state))[0]
                state, r, done, info = env.step(a)
                d = goal_distance(state['achieved_goal'], state['desired_goal'])
                done = d <= 0.05
                if done:
                    solve_count += 1
                    break
                score += r
            tr.set_description("Solve percentage: {:.3f}".format(solve_count / (ep + 1)))
            avg_time += t
        print("average time to solve:", avg_time / 100.0)
        return score / 100.0
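# ---------------------------------------------------------------------------
# Both train() and eval() above declare success when
# goal_distance(achieved, desired) <= 0.05. A hedged sketch of the usual
# goal-distance helper from Fetch-style goal environments (an assumption;
# the helper is not defined in this file):
import numpy as np

def goal_distance_sketch(goal_a, goal_b):
    # Euclidean distance between achieved and desired goal positions.
    goal_a, goal_b = np.asarray(goal_a), np.asarray(goal_b)
    return np.linalg.norm(goal_a - goal_b, axis=-1)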