import gym


class AntAgent:

    def __init__(self, render=False, model=None):
        # create an environment
        self.environment = gym.make('MountainCarContinuous-v0')
        # reset environment when an agent is initialized
        self.current_observation = self.reset_environment()
        self.render = render
        self.model = model
        self.buffer = ReplayBuffer()

    def reset_environment(self):
        current_observation = self.environment.reset()
        return current_observation

    def get_action(self, current_observation):
        """Fetch an action according to the model policy"""
        if self.model is None:
            action = self.environment.action_space.sample()
        else:
            action = self.model.predict(current_observation)
        return action

    def get_transitions(self, action):
        """Take one step in the environment and return the observations"""
        next_observation, reward, done, _ = self.environment.step(action)
        if self.render:
            self.environment.render()
        return next_observation, reward, done

    def run_episode(self, num_episodes=1):
        """Run `num_episodes` episodes using the `model` policy"""
        for episode in range(num_episodes):
            self.current_observation = self.reset_environment()
            episode_id = self.buffer.create_episode()
            done = False
            while not done:
                # build a fresh transition dict each step so buffered
                # samples do not all alias the same object
                transition = dict()
                transition['current_observation'] = self.current_observation
                transition['action'] = self.get_action(self.current_observation)
                transition['next_observation'], transition['reward'], done = \
                    self.get_transitions(transition['action'])
                self.buffer.add_sample(episode_id, transition)
                # advance to the next state
                self.current_observation = transition['next_observation']
            self.buffer.add_episode(episode_id)

    def learn(self, step=0, restore=False):
        """Train the SAC model using transitions in the replay buffer"""
        if self.model is None:
            raise Exception("This agent has no brain! Add a model which "
                            "implements fit() to train.")
        # Sample an array of transitions from the replay buffer.
        transition_matrices = self.buffer.fetch_sample()
        if step != 0:
            restore = True
        # Fit the SAC model.
        self.model.fit(transition_matrices, restore=restore, global_step=step)
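The ReplayBuffer used by AntAgent is not shown in this listing. A minimal sketch of the interface the agent assumes (create_episode, add_sample, add_episode, fetch_sample) might look like the following; the class name matches the listing, but the internals here are an illustrative assumption, not the original implementation:

import random


class ReplayBuffer:
    """Stores transitions grouped by episode and serves random samples."""

    def __init__(self, max_episodes=1000):
        self.max_episodes = max_episodes
        self.episodes = {}        # finished episodes, keyed by id
        self.open_episodes = {}   # episodes still being collected
        self.next_episode_id = 0

    def create_episode(self):
        # hand out a fresh id with an empty transition list
        episode_id = self.next_episode_id
        self.next_episode_id += 1
        self.open_episodes[episode_id] = []
        return episode_id

    def add_sample(self, episode_id, transition):
        # copy so the caller can reuse its transition dict
        self.open_episodes[episode_id].append(dict(transition))

    def add_episode(self, episode_id):
        # move a finished episode into the sampling pool
        self.episodes[episode_id] = self.open_episodes.pop(episode_id)
        if len(self.episodes) > self.max_episodes:
            del self.episodes[min(self.episodes)]

    def fetch_sample(self, batch_size=64):
        # flatten all stored transitions and sample a batch
        transitions = [t for ep in self.episodes.values() for t in ep]
        return random.sample(transitions, min(batch_size, len(transitions)))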
import datetime
import os
import random
import time

import numpy as np
import tensorflow as tf
import yaml


def run_for_config(config, agent_config, env_generator, is_in_collab=False):
    # set the name of the model
    model_name = config['general']['name']
    now = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y_%m_%d_%H_%M_%S')
    model_name = now + '_' + model_name if model_name is not None else now

    # random seed
    random_seed = config['general']['random_seed']
    np.random.seed(random_seed)
    random.seed(random_seed)
    tf.compat.v1.set_random_seed(random_seed)

    # where we save all the outputs
    working_dir = os.getcwd()
    if is_in_collab:
        working_dir = '/' + os.path.join('content', 'gdrive', 'My Drive',
                                         'colab_data', 'EvoDDPG')
    saver_dir = os.path.join(working_dir, 'models', model_name)
    if not os.path.exists(saver_dir):
        os.makedirs(saver_dir)
    config_copy_path = os.path.join(working_dir, 'models', model_name,
                                    'config.yml')
    summaries_dir = os.path.join(working_dir, 'tensorboard', model_name)

    # get environment constants
    state_dimension, action_dimension = env_generator.get_env_definitions()

    # construct the population manager
    population_manager = PopulationManager(config, agent_config)

    # generate networks
    network_manager = NetworksManager(config, state_dimension,
                                      action_dimension, population_manager)

    # initialize replay memory
    replay_buffer = ReplayBuffer(config)

    # model saver (save_relative_paths expects a boolean)
    saver = tf.train.Saver(max_to_keep=4, save_relative_paths=True)
    yaml.dump(config, open(config_copy_path, 'w'))

    summaries_collector = SummariesCollector(summaries_dir, model_name)
    episode_runner = EpisodeRunner(config,
                                   env_generator.get_env_wrapper().get_env(),
                                   network_manager)
    # visualization_episode_runner = EpisodeRunner(
    #     config, env_generator.get_env_wrapper().get_env(), network_manager,
    #     is_in_collab=is_in_collab)

    test_results = []

    def update_model(sess):
        batch_size = config['model']['batch_size']
        gamma = config['model']['gamma']
        current_state, action, reward, terminated, next_state = \
            replay_buffer.sample_batch(batch_size)

        # get the predicted q value of the next state
        # (the action is taken from the target policy)
        # network_manager.runing_network == 1 - network_manager.runing_network
        next_state_action_target_q = network_manager.predict_policy_q(
            next_state, sess, use_online_network=False)

        # compute the critic label
        q_label = {}
        one_hot_vector = []
        for i in range(batch_size):
            c = np.zeros(4)
            c[action[i]] = 1
            one_hot_vector.append(c)
        reward_batch = []
        for i in range(batch_size):
            d = np.zeros(4)
            d[action[i]] = reward[i]
            reward_batch = np.concatenate((reward_batch, d), axis=0)
        reward_batch = np.reshape(reward_batch, (batch_size, -1))
        terminated = [terminated, terminated, terminated, terminated]
        terminated = np.transpose(terminated)
        for network_id in next_state_action_target_q:
            q_label[network_id] = \
                np.expand_dims(np.array(reward_batch) + np.multiply(
                    np.multiply(1 - np.array(terminated), gamma),
                    np.array(next_state_action_target_q[network_id])), 1)
        for network_id in next_state_action_target_q:
            q_label[network_id] = np.multiply(
                np.squeeze(np.array(q_label[network_id])),
                np.array(one_hot_vector))

        # train the critics given the targets
        critic_optimization_summaries = network_manager.train_critics(
            current_state, q_label, sess)

        # update target networks
        # close because of double dqn
        network_manager.update_target_networks(sess)

        result = list(critic_optimization_summaries.values())
        # + list(actor_optimization_summaries.values())
        return result

    def compute_actor_score(episode_rewards, episode_lengths):
        return sum(episode_rewards)

    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=config['general']
            ['gpu_usage']))) as sess:
        sess.run(tf.global_variables_initializer())
        # close because of double dqn
        network_manager.update_target_networks(sess)

        global_step = 0
        total_episodes = 0
        for update_index in range(config['general']['updates_cycle_count']):
            episode_rewards, episode_lengths = [], []
            episodes_per_update = config['general']['episodes_per_update']
            actor_ids = network_manager.softmax_select_ids(episodes_per_update)
            total_rollout_time = None
            for actor_id in actor_ids:
                # run an episode:
                states, actions, rewards, done, rollout_time = \
                    episode_runner.run_episode(sess, actor_id, True)
                if total_rollout_time is None:
                    total_rollout_time = rollout_time
                else:
                    total_rollout_time += rollout_time
                # at the end of the episode
                replay_buffer.add_episode(states, actions, rewards, done)
                total_episodes += 1
                episode_rewards.append(sum(rewards))
                episode_lengths.append(len(rewards))
            print('rollouts took: {}'.format(total_rollout_time))
            print(rewards[-1])

            # do updates
            if replay_buffer.size() > config['model']['batch_size']:
                a = datetime.datetime.now()
                for _ in range(config['general']['model_updates_per_cycle']):
                    summaries = update_model(sess)
                    if global_step % config['general'][
                            'write_train_summaries'] == 0:
                        summaries_collector.write_train_episode_summaries(
                            sess, global_step, episode_rewards,
                            episode_lengths)
                        summaries_collector.write_train_optimization_summaries(
                            summaries, global_step)
                    global_step += 1
                b = datetime.datetime.now()
                # print('update took: {}'.format(b - a))

            # test if needed
            if update_index % config['test']['test_every_cycles'] == 0:
                # run a test
                number_of_episodes_per_actor = config['test'][
                    'number_of_episodes_per_actor']
                actor_scores = {}
                actor_stats = {}
                for actor_id in network_manager.ids:
                    episode_rewards, episode_lengths = [], []
                    for _ in range(number_of_episodes_per_actor):
                        states, actions, rewards, done, rollout_time = \
                            episode_runner.run_episode(sess, actor_id, False)
                        # at the end of the episode
                        episode_rewards.append(sum(rewards))
                        episode_lengths.append(len(rewards))
                    actor_scores[actor_id] = compute_actor_score(
                        episode_rewards, episode_lengths)
                    actor_stats[actor_id] = (episode_rewards, episode_lengths)
                # update the scores
                network_manager.set_scores(actor_scores)
                # get the statistics of the best actor:
                best_actor_id = network_manager.get_best_scoring_actor_id()
                episode_rewards, episode_lengths = actor_stats[best_actor_id]
                summaries_collector.write_test_episode_summaries(
                    sess, global_step, episode_rewards, episode_lengths)
                # run visualization with the best actor
                # TODO: needs to change once we have a population
                # visualization_episode_runner.run_episode(
                #     sess, best_actor_id, False,
                #     render=config['test']['show_best'])

            if update_index % config['general']['save_model_every_cycles'] == 0:
                saver.save(sess, os.path.join(saver_dir, 'all_graph'),
                           global_step=global_step)
    return test_results
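The per-sample loops in update_model build the one-hot action masks and the Bellman targets r + gamma * (1 - done) * Q_target(s') element by element. A vectorized sketch of the same computation follows; the standalone function name and signature are illustrative, not part of the original NetworksManager code, and it assumes `reward`, `action`, and `terminated` arrive as length-batch sequences:

import numpy as np


def bellman_targets(reward, action, terminated, next_q, gamma):
    """Vectorized form of the masked Bellman targets built above.

    reward, action, terminated: length-`batch_size` sequences
    next_q: (batch_size, action_dim) target-network predictions
    """
    reward = np.asarray(reward, dtype=np.float64)
    action = np.asarray(action, dtype=np.int64)
    terminated = np.asarray(terminated, dtype=np.float64)
    batch_size, action_dim = next_q.shape

    # one-hot mask for the action taken in each sampled transition
    one_hot = np.zeros((batch_size, action_dim))
    one_hot[np.arange(batch_size), action] = 1.0

    # r + gamma * (1 - done) * Q_target(s'), masked to the taken action
    targets = reward[:, None] + gamma * (1.0 - terminated[:, None]) * next_q
    return targets * one_hot

With a population of networks, the same function would be applied once per member on that member's own `next_q` matrix.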
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as kl
from tensorflow.keras import losses as kls
from tensorflow.keras import optimizers as ko


class DeepQNetwork:

    def __init__(self, config, gym_wrapper):
        self.config = config
        self.gym_wrapper = gym_wrapper
        self.q_model = self._create_net()
        self.q_target_model = self._create_net()
        self.replay_buffer = ReplayBuffer(
            self.config['model']['replay_buffer_size'])

    def _create_net(self):
        activation = self.config['policy_network']['activation']
        model = keras.Sequential()
        # hidden layers sized per the config
        for i, layer_size in enumerate(
                self.config['policy_network']['layers']):
            if i == 0:
                state_size = self.gym_wrapper.get_state_size()
                model.add(
                    kl.Dense(layer_size,
                             activation=activation,
                             input_shape=state_size))
            else:
                model.add(kl.Dense(layer_size, activation=activation))
        model.add(kl.Dense(self.gym_wrapper.get_num_actions()))
        model.compile(
            optimizer=ko.Adam(lr=self.config['policy_network']['learn_rate']),
            loss=[self._get_mse_for_action])
        return model

    def _get_mse_for_action(self, target_and_action, current_prediction):
        # the label tensor packs the scalar target with the one-hot action
        targets, one_hot_action = tf.split(
            target_and_action, [1, self.gym_wrapper.get_num_actions()],
            axis=1)
        active_q_value = tf.expand_dims(
            tf.reduce_sum(current_prediction * one_hot_action, axis=1),
            axis=-1)
        return kls.mean_squared_error(targets, active_q_value)

    def _update_target(self):
        # Polyak averaging: target <- tau * online + (1 - tau) * target
        q_weights = self.q_model.get_weights()
        q_target_weights = self.q_target_model.get_weights()
        tau = self.config['policy_network']['tau']
        q_weights = [tau * w for w in q_weights]
        q_target_weights = [(1. - tau) * w for w in q_target_weights]
        new_weights = [
            q_weights[i] + q_target_weights[i] for i in range(len(q_weights))
        ]
        self.q_target_model.set_weights(new_weights)

    def _one_hot_action(self, actions):
        action_index = np.array(actions)
        batch_size = len(actions)
        result = np.zeros((batch_size, self.gym_wrapper.get_num_actions()))
        result[np.arange(batch_size), action_index] = 1.
        return result

    def train(self):
        env = self.gym_wrapper.get_env()
        completion_reward = self.config['general']['completion_reward']
        episode_collector = EpisodeCollector(
            self.q_model, env, self.gym_wrapper.get_num_actions())
        epsilon = self.config['model']['epsilon']
        for cycle in range(self.config['general']['cycles']):
            print('cycle {} epsilon {}'.format(cycle, epsilon))
            epsilon, train_avg_reward = self._train_cycle(
                episode_collector, epsilon)
            if (cycle + 1) % self.config['general']['test_frequency'] == 0 \
                    or (completion_reward is not None
                        and train_avg_reward > completion_reward):
                test_avg_reward = self.test(False)
                if completion_reward is not None and \
                        test_avg_reward > completion_reward:
                    print('TEST avg reward {} > required reward {}... '
                          'stopping training'.format(test_avg_reward,
                                                     completion_reward))
                    break
        env.close()

    def _train_cycle(self, episode_collector, epsilon):
        # collect data
        max_episode_steps = self.config['general']['max_train_episode_steps']
        rewards_per_episode = []
        for _ in range(self.config['general']['episodes_per_training_cycle']):
            states, actions, rewards, is_terminal_flags = \
                episode_collector.collect_episode(max_episode_steps,
                                                  epsilon=epsilon)
            self.replay_buffer.add_episode(states, actions, rewards,
                                           is_terminal_flags)
            rewards_per_episode.append(sum(rewards))
        avg_rewards = np.mean(rewards_per_episode)
        print('collected rewards: {}'.format(avg_rewards))

        # decay exploration
        epsilon *= self.config['model']['decrease_epsilon']
        epsilon = max(epsilon, self.config['model']['min_epsilon'])

        # train steps
        for _ in range(self.config['model']['train_steps_per_cycle']):
            self._train_step()

        # update the target network
        self._update_target()
        return epsilon, avg_rewards

    def _train_step(self):
        batch_size = self.config['model']['batch_size']
        current_state, action, reward, next_state, is_terminal = zip(
            *self.replay_buffer.sample_batch(batch_size))
        next_q_values = self.q_target_model.predict(np.array(next_state))
        max_next_q_value = np.max(next_q_values, axis=-1)
        # note: the targets omit a discount factor, i.e. gamma is
        # implicitly 1 here
        target_labels = np.array(reward) + \
            (1. - np.array(is_terminal)) * max_next_q_value
        one_hot_actions = self._one_hot_action(action)
        target_and_actions = np.concatenate(
            (target_labels[:, None], one_hot_actions), axis=1)
        loss = self.q_model.train_on_batch(np.array(current_state),
                                           target_and_actions)

    def test(self, render=True, episodes=None):
        env = self.gym_wrapper.get_env()
        episode_collector = EpisodeCollector(
            self.q_model, env, self.gym_wrapper.get_num_actions())
        max_episode_steps = self.config['general']['max_test_episode_steps']
        rewards_per_episode = []
        if episodes is None:
            episodes = self.config['general']['episodes_per_test']
        for _ in range(episodes):
            rewards = episode_collector.collect_episode(max_episode_steps,
                                                        epsilon=0.,
                                                        render=render)[2]
            rewards_per_episode.append(sum(rewards))
        env.close()
        avg_reward = np.mean(rewards_per_episode)
        print('TEST collected rewards: {}'.format(avg_reward))
        return avg_reward
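DeepQNetwork drives everything off a nested config dict. The keys below mirror exactly the lookups the class performs; the values are hypothetical placeholders, not tuned settings from the original project:

# Hypothetical values; keys mirror the config lookups in DeepQNetwork.
config = {
    'general': {
        'cycles': 200,
        'episodes_per_training_cycle': 5,
        'episodes_per_test': 10,
        'test_frequency': 10,
        'completion_reward': 195,          # None disables early stopping
        'max_train_episode_steps': 500,
        'max_test_episode_steps': 500,
    },
    'model': {
        'replay_buffer_size': 100000,
        'batch_size': 64,
        'epsilon': 1.0,
        'decrease_epsilon': 0.99,
        'min_epsilon': 0.05,
        'train_steps_per_cycle': 50,
    },
    'policy_network': {
        'layers': [128, 128],
        'activation': 'relu',
        'learn_rate': 1e-3,
        'tau': 0.05,
    },
}

# dqn = DeepQNetwork(config, gym_wrapper)  # gym_wrapper supplied elsewhere
# dqn.train()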
def run_for_config(config, agent_config, env_generator, is_in_collab=False):
    # set the name of the model
    model_name = config['general']['name']
    now = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y_%m_%d_%H_%M_%S')
    model_name = now + '_' + model_name if model_name is not None else now

    # random seed
    random_seed = config['general']['random_seed']
    np.random.seed(random_seed)
    random.seed(random_seed)
    tf.compat.v1.set_random_seed(random_seed)

    # where we save all the outputs
    working_dir = os.getcwd()
    if is_in_collab:
        working_dir = '/' + os.path.join('content', 'gdrive', 'My Drive',
                                         'colab_data', 'dqn_evo')
    saver_dir = os.path.join(working_dir, 'models', model_name)
    if not os.path.exists(saver_dir):
        os.makedirs(saver_dir)
    config_copy_path = os.path.join(working_dir, 'models', model_name,
                                    'config.yml')
    summaries_dir = os.path.join(working_dir, 'tensorboard', model_name)

    # get environment constants
    state_dimension, action_dimension = env_generator.get_env_definitions()

    # construct the population manager
    population_manager = PopulationManager(config, agent_config)

    # generate networks
    network_manager = NetworksManager(config, state_dimension,
                                      action_dimension, population_manager)

    # initialize replay memory
    replay_buffer = ReplayBuffer(config)

    # model saver (save_relative_paths expects a boolean)
    saver = tf.train.Saver(max_to_keep=4, save_relative_paths=True)
    yaml.dump(config, open(config_copy_path, 'w'))

    summaries_collector = SummariesCollector(summaries_dir, model_name, config)
    episode_runner = EpisodeRunner(config,
                                   env_generator.get_env_wrapper().get_env(),
                                   network_manager)
    visualization_episode_runner = EpisodeRunner(
        config, env_generator.get_env_wrapper().get_env(), network_manager,
        is_in_collab=is_in_collab)

    test_results = []

    def update_model(sess):
        batch_size = config['model']['batch_size']
        gamma = config['model']['gamma']
        population = config['evolution']['population']

        # sample a batch from the replay buffer
        current_state, action, reward, terminated, next_state = \
            replay_buffer.sample_batch(batch_size)

        # get the predicted q values (shaped to the action dimension)
        next_state_action_target_q = network_manager.predict_action(
            next_state, sess, use_online_network=False)

        # use the Bellman equation to update the networks:
        # one-hot vectors - TODO: build them in one shot, without the loop
        one_hot_vector = []
        one_hot_vector_bprop = []
        for i in range(batch_size):
            # greedy action, taken from network 0's predictions
            one_step_hot = np.zeros(action_dimension)
            one_step_hot[np.argmax(next_state_action_target_q[0][i])] = 1
            one_hot_vector.append(one_step_hot)
            # action actually taken, used to mask the backprop
            one_step_hot_bprop = np.zeros(action_dimension)
            one_step_hot_bprop[action[i]] = 1
            one_hot_vector_bprop.append(one_step_hot_bprop)

        # reward
        reward_batch = []
        for i in range(batch_size):
            d = np.zeros(action_dimension)
            d[action[i]] = reward[i]
            reward_batch = np.concatenate((reward_batch, d), axis=0)
        reward_batch = np.reshape(reward_batch, (batch_size, -1))

        # terminated (end of game) - TODO: vectorize this
        term = []
        for i in range(action_dimension):
            term.append(terminated)
        terminated = np.transpose(term)

        # compute the critic label
        q_label = {}
        # just gamma masked by "not end of game"
        gamma_term = np.multiply(1 - np.array(terminated), gamma)
        # Bellman backup
        for network_id in range(population):
            q_label[network_id] = \
                np.expand_dims(
                    np.array(reward_batch) + np.multiply(
                        gamma_term,
                        np.array(next_state_action_target_q[network_id])), 1)
        for network_id in range(population):
            q_label[network_id] = np.multiply(
                np.squeeze(np.array(q_label[network_id])),
                np.array(one_hot_vector))
            # q_label[network_id] = np.sum(q_label[network_id], axis=1)

        critic_optimization_summaries = network_manager.train_critics(
            current_state, q_label, one_hot_vector_bprop, sess)

        # update target networks
        network_manager.update_target_networks(sess)

        result = list(critic_optimization_summaries.values())
        # + list(actor_optimization_summaries.values())
        return result

    def compute_actor_score(episode_rewards):
        return sum(episode_rewards)

    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=config['general']
            ['gpu_usage']))) as sess:
        sess.run(tf.global_variables_initializer())
        # CHECK IF NEEDED -- this would update the target nets twice
        # (check against Tom's old code)
        # network_manager.update_target_networks(sess)

        global_step = 0
        for update_index in range(config['general']['updates_cycle_count']):
            episode_rewards, episode_lengths = [], []
            episodes_per_update = config['general']['episodes_per_update']
            actor_ids = network_manager.softmax_select_ids(episodes_per_update)
            total_rollout_time = None
            for actor_id in actor_ids:
                # run an episode:
                states, actions, rewards, done, rollout_time = \
                    episode_runner.run_episode(sess, actor_id, True)
                if total_rollout_time is None:
                    total_rollout_time = rollout_time
                else:
                    total_rollout_time += rollout_time
                # at the end of the episode
                replay_buffer.add_episode(states, actions, rewards, done)
                episode_rewards.append(sum(rewards))
                episode_lengths.append(len(rewards))

            # do updates
            if replay_buffer.size() > config['model']['batch_size']:
                for _ in range(config['general']['model_updates_per_cycle']):
                    summaries = update_model(sess)
                    if global_step % config['general'][
                            'write_train_summaries'] == 0:
                        summaries_collector.write_train_episode_summaries(
                            sess, global_step, episode_rewards,
                            episode_lengths, actor_id)
                        summaries_collector.write_train_optimization_summaries(
                            summaries, global_step)
                    global_step += 1

            # test if needed
            if update_index % config['test']['test_every_cycles'] == 0:
                # run a test
                number_of_episodes_per_actor = config['test'][
                    'number_of_episodes_per_actor']
                actor_scores = {}
                actor_stats = {}
                for actor_id in network_manager.ids:
                    episode_rewards, episode_lengths = [], []
                    for _ in range(number_of_episodes_per_actor):
                        states, actions, rewards, done, rollout_time = \
                            episode_runner.run_episode(sess, actor_id, False)
                        # at the end of the episode
                        episode_rewards.append(sum(rewards))
                        episode_lengths.append(len(rewards))
                    actor_scores[actor_id] = compute_actor_score(
                        episode_rewards)
                    actor_stats[actor_id] = (episode_rewards, episode_lengths)
                # update the scores
                network_manager.set_scores(actor_scores)
                # get the statistics of the best actor:
                best_actor_id = network_manager.get_best_scoring_actor_id()
                episode_rewards, episode_lengths = actor_stats[best_actor_id]
                # if np.argmax(episode_rewards) > 10:
                #     print("agent won!!")
                summaries_collector.write_test_episode_summaries(
                    sess, global_step, episode_rewards, episode_lengths,
                    best_actor_id)
                # run visualization with the best actor
                visualization_episode_runner.run_episode(
                    sess, best_actor_id, False,
                    render=config['test']['show_best'])

            if update_index % config['general']['save_model_every_cycles'] == 0:
                saver.save(sess, os.path.join(saver_dir, 'all_graph'),
                           global_step=global_step)
    return test_results
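Both training loops pick which population member rolls out via network_manager.softmax_select_ids, whose body is not shown in this listing. A plausible standalone sketch is below, assuming scores map actor id to the latest test score (as set via set_scores) and ids are sampled proportionally to a softmax over those scores; the function, its signature, and the temperature parameter are all illustrative assumptions:

import numpy as np


def softmax_select_ids(scores, count, temperature=1.0):
    """Sample `count` actor ids with probability softmax(score / temperature).

    Illustrative stand-in for NetworksManager.softmax_select_ids;
    `scores` maps actor id -> latest score.
    """
    ids = list(scores.keys())
    values = np.array([scores[i] for i in ids], dtype=np.float64)
    values = values / temperature
    values -= values.max()               # stabilize the exponent
    probabilities = np.exp(values)
    probabilities /= probabilities.sum()
    return list(np.random.choice(ids, size=count, p=probabilities))

Under this scheme, higher-scoring actors collect more experience each cycle, while the temperature keeps weaker members sampled often enough to stay evaluated.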