class AntAgent:

    def __init__(self, render=False, model=None):
        # create an environment
        self.environment = gym.make('MountainCarContinuous-v0')
        # reset environment when an agent is initialized
        self.current_observation = self.reset_environment()
        self.render = render
        self.model = model

        self.buffer = ReplayBuffer()

    def reset_environment(self):
        current_observation = self.environment.reset()
        return current_observation

    def get_action(self, current_observation):
        """Fetch an action according to model policy"""
        if self.model is None:
            action = self.environment.action_space.sample()
        else:
            action = self.model.predict(current_observation)
        return action

    def get_transitions(self, action):
        """Take one step in the environment and return the observations"""
        next_observation, reward, done, _ = self.environment.step(action)
        if self.render:
            self.environment.render()
        return next_observation, reward, done

    def run_episode(self, num_episodes=1):
        """run episodes `num_episodes` times using `model` policy"""
        for episode in range(num_episodes):
            self.current_observation = self.reset_environment()
            episode_id = self.buffer.create_episode()

            done = False

            while not done:
                # build a fresh transition dict each step so the buffer never
                # keeps a reference to a dict that is mutated on the next step
                transition = dict()
                transition['current_observation'] = self.current_observation
                transition['action'] = self.get_action(self.current_observation)
                transition['next_observation'], transition['reward'], done = self.get_transitions(transition['action'])

                self.buffer.add_sample(episode_id, transition)
                # advance to the newly observed state for the next step
                self.current_observation = transition['next_observation']

            self.buffer.add_episode(episode_id)

    def learn(self, step=0, restore=False):
        """Train SAC model using transitions in replay buffer"""
        if self.model is None:
            raise Exception("This agent has no brain! Add a model which implements fit() function to train.")

        # Sample array of transitions from replay buffer.
        transition_matrices = self.buffer.fetch_sample()

        if step != 0:
            restore = True

        # Fit the SAC model.
        self.model.fit(transition_matrices, restore=restore, global_step=step)
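
# Minimal usage sketch, assuming `gym` and the project-local ReplayBuffer are
# importable and that any model passed in exposes predict() and fit(); the
# names below are illustrative:
if __name__ == '__main__':
    agent = AntAgent(render=False)       # no model -> random actions
    agent.run_episode(num_episodes=3)    # fills agent.buffer with transitions
    # agent.learn() would additionally require a model implementing fit()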
Example #2
def run_for_config(config, agent_config, env_generator, is_in_collab=False):
    # set the name of the model
    model_name = config['general']['name']
    now = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y_%m_%d_%H_%M_%S')
    model_name = now + '_' + model_name if model_name is not None else now

    # random seed
    random_seed = config['general']['random_seed']
    np.random.seed(random_seed)
    random.seed(random_seed)
    tf.compat.v1.set_random_seed(random_seed)

    # where we save all the outputs
    working_dir = os.getcwd()
    if is_in_collab:
        working_dir = '/' + os.path.join('content', 'gdrive', 'My Drive',
                                         'colab_data', 'EvoDDPG')
    saver_dir = os.path.join(working_dir, 'models', model_name)
    if not os.path.exists(saver_dir):
        os.makedirs(saver_dir)
    config_copy_path = os.path.join(working_dir, 'models', model_name,
                                    'config.yml')
    summaries_dir = os.path.join(working_dir, 'tensorboard', model_name)

    # get environment constants
    state_dimension, action_dimension = env_generator.get_env_definitions()

    # construct population manager
    population_manager = PopulationManager(config, agent_config)

    # generate networks
    network_manager = NetworksManager(config, state_dimension,
                                      action_dimension, population_manager)

    # initialize replay memory
    replay_buffer = ReplayBuffer(config)

    # model saver; also keep a copy of the config next to the checkpoints
    saver = tf.train.Saver(max_to_keep=4, save_relative_paths=True)
    with open(config_copy_path, 'w') as config_copy_file:
        yaml.dump(config, config_copy_file)
    summaries_collector = SummariesCollector(summaries_dir, model_name)
    episode_runner = EpisodeRunner(config,
                                   env_generator.get_env_wrapper().get_env(),
                                   network_manager)
    #visualization_episode_runner = EpisodeRunner(
    #    config, env_generator.get_env_wrapper().get_env(), network_manager, is_in_collab=is_in_collab

    test_results = []

    def update_model(sess):
        batch_size = config['model']['batch_size']
        gamma = config['model']['gamma']
        current_state, action, reward, terminated, next_state = replay_buffer.sample_batch(
            batch_size)

        # get the predicted q value of the next state (action is taken from the target policy)
        #network_manager.runing_network == 1 - network_manager.runing_network
        next_state_action_target_q = network_manager.predict_policy_q(
            next_state, sess, use_online_network=False)

        # compute critic label
        q_label = {}

        one_hot_vector = []
        for i in range(batch_size):
            c = np.zeros(4)
            c[action[i]] = 1
            one_hot_vector.append(c)

        reward_batch = []
        for i in range(batch_size):
            d = np.zeros(4)
            d[action[i]] = reward[i]
            reward_batch = np.concatenate((reward_batch, d), axis=0)
        reward_batch = np.reshape(reward_batch, (batch_size, -1))
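
        # (A vectorized equivalent of the two loops above, assuming `action` and
        #  `reward` are 1-D arrays of length batch_size and that 4 is this
        #  environment's action dimension, would be:
        #      one_hot_vector = np.eye(4)[action]
        #      reward_batch = one_hot_vector * np.asarray(reward)[:, None]
        #  kept as a comment so the loop-based version above stays the reference.)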

        terminated = [terminated, terminated, terminated, terminated]
        terminated = np.transpose(terminated)

        for network_id in next_state_action_target_q:
            q_label[network_id] = \
                np.expand_dims(np.array(reward_batch) +
                               np.multiply(
                                   np.multiply(1 - np.array(terminated), gamma),
                                   np.array(next_state_action_target_q[network_id])), 1)

        for network_id in next_state_action_target_q:

            q_label[network_id] = np.multiply(
                np.squeeze(np.array(q_label[network_id])),
                np.array(one_hot_vector))
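
        # At this point q_label[network_id][i, a] holds the Bellman target
        #     reward_i + gamma * (1 - terminated_i) * (target network's next-state Q estimate)
        # in the column of the taken action a = action[i]; every other column is
        # zeroed out by the one-hot mask.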

        # train critic given the targets
        critic_optimization_summaries = network_manager.train_critics(
            current_state, q_label, sess)

        # update target networks
        # note: may need to be disabled when using double DQN
        network_manager.update_target_networks(sess)
        result = list(critic_optimization_summaries.values())
        # + list(actor_optimization_summaries.values())
        return result

    def compute_actor_score(episode_rewards, episode_lengths):
        return sum(episode_rewards)

    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=config['general']
        ['gpu_usage']))) as sess:
        sess.run(tf.global_variables_initializer())
        # note: may need to be disabled when using double DQN
        network_manager.update_target_networks(sess)

        global_step = 0
        total_episodes = 0
        for update_index in range(config['general']['updates_cycle_count']):

            episode_rewards, episode_lengths = [], []
            episodes_per_update = config['general']['episodes_per_update']
            actor_ids = network_manager.softmax_select_ids(episodes_per_update)
            total_rollout_time = None
            for actor_id in actor_ids:
                # run episode:
                states, actions, rewards, done, rollout_time = episode_runner.run_episode(
                    sess, actor_id, True)
                if total_rollout_time is None:
                    total_rollout_time = rollout_time
                else:
                    total_rollout_time += rollout_time
                # at the end of episode
                replay_buffer.add_episode(states, actions, rewards, done)
                total_episodes += 1

                episode_rewards.append(sum(rewards))
                episode_lengths.append(len(rewards))

            print('rollouts took: {}'.format(total_rollout_time))
            print('final reward of last rollout: {}'.format(rewards[-1]))

            # do updates
            if replay_buffer.size() > config['model']['batch_size']:
                a = datetime.datetime.now()
                for _ in range(config['general']['model_updates_per_cycle']):
                    summaries = update_model(sess)
                    if global_step % config['general'][
                            'write_train_summaries'] == 0:
                        summaries_collector.write_train_episode_summaries(
                            sess, global_step, episode_rewards,
                            episode_lengths)
                        summaries_collector.write_train_optimization_summaries(
                            summaries, global_step)
                    global_step += 1
                b = datetime.datetime.now()
                # print('update took: {}'.format(b - a))

            # test if needed
            if update_index % config['test']['test_every_cycles'] == 0:
                # run test
                number_of_episodes_per_actor = config['test'][
                    'number_of_episodes_per_actor']
                actor_scores = {}
                actor_stats = {}
                for actor_id in network_manager.ids:
                    episode_rewards, episode_lengths = [], []
                    for _ in range(number_of_episodes_per_actor):
                        states, actions, rewards, done, rollout_time = episode_runner.run_episode(
                            sess, actor_id, False)
                        # at the end of episode
                        episode_rewards.append(sum(rewards))
                        episode_lengths.append(len(rewards))
                    actor_scores[actor_id] = compute_actor_score(
                        episode_rewards, episode_lengths)
                    actor_stats[actor_id] = (episode_rewards, episode_lengths)
                # update the scores
                network_manager.set_scores(actor_scores)
                # get the statistics of the best actor:
                best_actor_id = network_manager.get_best_scoring_actor_id()
                episode_rewards, episode_lengths = actor_stats[best_actor_id]
                summaries_collector.write_test_episode_summaries(
                    sess, global_step, episode_rewards, episode_lengths)
                # run visualization with the best actor
                #--------------need to change when we have population----------------------
                #visualization_episode_runner.run_episode(sess, best_actor_id, False, render=config['test']['show_best'])

            if update_index % config['general']['save_model_every_cycles'] == 0:
                saver.save(sess,
                           os.path.join(saver_dir, 'all_graph'),
                           global_step=global_step)
    return test_results
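
# Invocation sketch (hypothetical: the config file name and the agent_config and
# env_generator objects are assumed to be provided by the surrounding project):
# config = yaml.safe_load(open('config.yml'))
# run_for_config(config, agent_config, env_generator, is_in_collab=False)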
Example #3
class DeepQNetwork:
    def __init__(self, config, gym_wrapper):
        self.config = config
        self.gym_wrapper = gym_wrapper
        self.q_model = self._create_net()
        self.q_target_model = self._create_net()
        self.replay_buffer = ReplayBuffer(
            self.config['model']['replay_buffer_size'])

    def _create_net(self):
        activation = self.config['policy_network']['activation']
        model = keras.Sequential()
        # each entry in config['policy_network']['layers'] is taken as a hidden-layer width
        for i, layer_size in enumerate(self.config['policy_network']['layers']):
            if i == 0:
                state_size = self.gym_wrapper.get_state_size()
                model.add(
                    kl.Dense(layer_size,
                             activation=activation,
                             input_shape=state_size))
            else:
                model.add(kl.Dense(layer_size, activation=activation))
        model.add(kl.Dense(self.gym_wrapper.get_num_actions()))

        model.compile(
            optimizer=ko.Adam(lr=self.config['policy_network']['learn_rate']),
            loss=[self._get_mse_for_action])

        return model

    def _get_mse_for_action(self, target_and_action, current_prediction):
        targets, one_hot_action = tf.split(
            target_and_action, [1, self.gym_wrapper.get_num_actions()], axis=1)
        active_q_value = tf.expand_dims(tf.reduce_sum(current_prediction *
                                                      one_hot_action,
                                                      axis=1),
                                        axis=-1)
        return kls.mean_squared_error(targets, active_q_value)

    def _update_target(self):
        q_weights = self.q_model.get_weights()
        q_target_weights = self.q_target_model.get_weights()

        tau = self.config['policy_network']['tau']
        q_weights = [tau * w for w in q_weights]
        q_target_weights = [(1. - tau) * w for w in q_target_weights]
        new_weights = [
            q_weights[i] + q_target_weights[i] for i in range(len(q_weights))
        ]
        self.q_target_model.set_weights(new_weights)
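        # (Soft "Polyak" update: with e.g. tau = 0.1 each target weight moves as
        #  w_target <- 0.1 * w_online + 0.9 * w_target, so the target network
        #  tracks the online network slowly and keeps the TD targets stable.)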

    def _one_hot_action(self, actions):
        action_index = np.array(actions)
        batch_size = len(actions)
        result = np.zeros((batch_size, self.gym_wrapper.get_num_actions()))
        result[np.arange(batch_size), action_index] = 1.
        return result

    def train(self):
        env = self.gym_wrapper.get_env()
        completion_reward = self.config['general']['completion_reward']
        episode_collector = EpisodeCollector(
            self.q_model, env, self.gym_wrapper.get_num_actions())
        epsilon = self.config['model']['epsilon']
        for cycle in range(self.config['general']['cycles']):
            print('cycle {} epsilon {}'.format(cycle, epsilon))
            epsilon, train_avg_reward = self._train_cycle(
                episode_collector, epsilon)

            if (cycle + 1) % self.config['general']['test_frequency'] == 0 or (
                    completion_reward is not None
                    and train_avg_reward > completion_reward):
                test_avg_reward = self.test(False)

                if completion_reward is not None and test_avg_reward > completion_reward:
                    print(
                        'TEST avg reward {} > required reward {}... stopping training'
                        .format(test_avg_reward, completion_reward))
                    break
        env.close()

    def _train_cycle(self, episode_collector, epsilon):
        # collect data
        max_episode_steps = self.config['general']['max_train_episode_steps']
        rewards_per_episode = []
        for _ in range(self.config['general']['episodes_per_training_cycle']):
            states, actions, rewards, is_terminal_flags = episode_collector.collect_episode(
                max_episode_steps, epsilon=epsilon)
            self.replay_buffer.add_episode(states, actions, rewards,
                                           is_terminal_flags)
            rewards_per_episode.append(sum(rewards))

        avg_rewards = np.mean(rewards_per_episode)
        print('collected rewards: {}'.format(avg_rewards))
        epsilon *= self.config['model']['decrease_epsilon']
        epsilon = max(epsilon, self.config['model']['min_epsilon'])

        # train steps
        for _ in range(self.config['model']['train_steps_per_cycle']):
            self._train_step()

        # update target network
        self._update_target()
        return epsilon, avg_rewards

    def _train_step(self):
        batch_size = self.config['model']['batch_size']
        current_state, action, reward, next_state, is_terminal = zip(
            *self.replay_buffer.sample_batch(batch_size))
        next_q_values = self.q_target_model.predict(np.array(next_state))
        max_next_q_value = np.max(next_q_values, axis=-1)
        target_labels = np.array(
            reward) + (1. - np.array(is_terminal)) * max_next_q_value
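        # note: the target is reward + (1 - is_terminal) * max_a Q_target(s', a);
        # a discount factor would normally multiply the bootstrapped term, so this
        # example effectively uses gamma = 1.0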
        one_hot_actions = self._one_hot_action(action)
        target_and_actions = np.concatenate(
            (target_labels[:, None], one_hot_actions), axis=1)
        loss = self.q_model.train_on_batch(np.array(current_state),
                                           target_and_actions)

    def test(self, render=True, episodes=None):
        env = self.gym_wrapper.get_env()
        episode_collector = EpisodeCollector(
            self.q_model, env, self.gym_wrapper.get_num_actions())
        max_episode_steps = self.config['general']['max_test_episode_steps']
        rewards_per_episode = []
        if episodes is None:
            episodes = self.config['general']['episodes_per_test']
        for _ in range(episodes):
            rewards = episode_collector.collect_episode(max_episode_steps,
                                                        epsilon=0.,
                                                        render=render)[2]
            rewards_per_episode.append(sum(rewards))
        env.close()
        avg_reward = np.mean(rewards_per_episode)
        print('TEST collected rewards: {}'.format(avg_reward))
        return avg_reward
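
# Minimal usage sketch (hypothetical: `config` must contain the keys read above
# and `gym_wrapper` must expose get_env(), get_state_size() and get_num_actions()):
# dqn = DeepQNetwork(config, gym_wrapper)
# dqn.train()
# dqn.test(render=False, episodes=5)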
Example #4
def run_for_config(config, agent_config, env_generator, is_in_collab=False):
    # set the name of the model
    model_name = config['general']['name']
    now = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y_%m_%d_%H_%M_%S')
    model_name = now + '_' + model_name if model_name is not None else now

    # random seed
    random_seed = config['general']['random_seed']
    np.random.seed(random_seed)
    random.seed(random_seed)
    tf.compat.v1.set_random_seed(random_seed)

    # where we save all the outputs
    working_dir = os.getcwd()
    if is_in_collab:
        working_dir = '/' + os.path.join('content', 'gdrive', 'My Drive',
                                         'colab_data', 'dqn_evo')
    saver_dir = os.path.join(working_dir, 'models', model_name)
    if not os.path.exists(saver_dir):
        os.makedirs(saver_dir)
    config_copy_path = os.path.join(working_dir, 'models', model_name,
                                    'config.yml')
    summaries_dir = os.path.join(working_dir, 'tensorboard', model_name)

    # get environment constants
    state_dimension, action_dimension = env_generator.get_env_definitions()

    # construct population manager
    population_manager = PopulationManager(config, agent_config)

    # generate networks
    network_manager = NetworksManager(config, state_dimension,
                                      action_dimension, population_manager)

    # initialize replay memory
    replay_buffer = ReplayBuffer(config)

    # model saver; also keep a copy of the config next to the checkpoints
    saver = tf.train.Saver(max_to_keep=4, save_relative_paths=True)
    with open(config_copy_path, 'w') as config_copy_file:
        yaml.dump(config, config_copy_file)
    summaries_collector = SummariesCollector(summaries_dir, model_name, config)
    episode_runner = EpisodeRunner(config,
                                   env_generator.get_env_wrapper().get_env(),
                                   network_manager)
    visualization_episode_runner = EpisodeRunner(
        config,
        env_generator.get_env_wrapper().get_env(),
        network_manager,
        is_in_collab=is_in_collab)

    test_results = []

    def update_model(sess):
        batch_size = config['model']['batch_size']
        gamma = config['model']['gamma']
        population = config['evolution']['population']

        #sample a batch from replay buffer
        current_state, action, reward, terminated, next_state = replay_buffer.sample_batch(
            batch_size)
        # get predicted q values (shape: batch_size x action dimension)
        next_state_action_target_q = network_manager.predict_action(
            next_state, sess, use_online_network=False)

        # now use the Bellman equation to build the update targets:

        # one-hot vectors - TODO: build these in a single vectorized step instead of a loop
        one_hot_vector = []
        one_hot_vector_bprop = []
        for i in range(batch_size):
            one_step_hot = np.zeros(action_dimension)
            one_step_hot[np.argmax(next_state_action_target_q[0][i])] = 1
            one_hot_vector.append(one_step_hot)

            one_step_hot_bprop = np.zeros(action_dimension)
            one_step_hot_bprop[action[i]] = 1
            one_hot_vector_bprop.append(one_step_hot_bprop)
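
        # note: the greedy next-state action comes from network 0's target-Q
        # predictions, and the same one-hot mask is applied to every network in
        # the population below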

        # per-action reward matrix (reward placed at the taken action's index)
        reward_batch = []
        for i in range(batch_size):
            d = np.zeros(action_dimension)
            d[action[i]] = reward[i]
            reward_batch = np.concatenate((reward_batch, d), axis=0)

        reward_batch = np.reshape(reward_batch, (batch_size, -1))

        # terminated (end-of-episode) flags tiled across the action dimension - TODO: vectorize
        term = []
        for i in range(action_dimension):
            term.append(terminated)
        terminated = np.transpose(term)

        # compute critic label
        q_label = {}

        # gamma masked by the not-terminated indicator
        gamma_term = np.multiply(1 - np.array(terminated), gamma)

        #Bellman
        for network_id in range(population):
            q_label[network_id] = \
                np.expand_dims(np.array(reward_batch) +
                               np.multiply(gamma_term, np.array(next_state_action_target_q[network_id])), 1)

        for network_id in range(population):
            q_label[network_id] = np.multiply(
                np.squeeze(np.array(q_label[network_id])),
                np.array(one_hot_vector))
            #q_label[network_id] = np.sum(q_label[network_id],axis=1)

        critic_optimization_summaries = network_manager.train_critics(
            current_state, q_label, one_hot_vector_bprop, sess)

        # update target networks
        network_manager.update_target_networks(sess)
        result = list(critic_optimization_summaries.values())
        # + list(actor_optimization_summaries.values())
        return result

    def compute_actor_score(episode_rewards):
        return sum(episode_rewards)

    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=config['general']
        ['gpu_usage']))) as sess:
        sess.run(tf.global_variables_initializer())
        # CHECK IF NEEDED -- update the target net twice here (compare with Tom's old code and confirm with him)
        #network_manager.update_target_networks(sess)

        global_step = 0
        for update_index in range(config['general']['updates_cycle_count']):
            episode_rewards, episode_lengths = [], []
            episodes_per_update = config['general']['episodes_per_update']
            actor_ids = network_manager.softmax_select_ids(episodes_per_update)
            total_rollout_time = None
            for actor_id in actor_ids:
                # run episode:
                states, actions, rewards, done, rollout_time = episode_runner.run_episode(
                    sess, actor_id, True)
                if total_rollout_time is None:
                    total_rollout_time = rollout_time
                else:
                    total_rollout_time += rollout_time
                # at the end of episode
                replay_buffer.add_episode(states, actions, rewards, done)
                episode_rewards.append(sum(rewards))
                episode_lengths.append(len(rewards))

            # do updates
            if replay_buffer.size() > config['model']['batch_size']:
                for _ in range(config['general']['model_updates_per_cycle']):
                    summaries = update_model(sess)
                    if global_step % config['general'][
                            'write_train_summaries'] == 0:
                        summaries_collector.write_train_episode_summaries(
                            sess, global_step, episode_rewards,
                            episode_lengths, actor_id)
                        summaries_collector.write_train_optimization_summaries(
                            summaries, global_step)
                    global_step += 1

            # test if needed
            if update_index % config['test']['test_every_cycles'] == 0:
                # run test
                number_of_episodes_per_actor = config['test'][
                    'number_of_episodes_per_actor']
                actor_scores = {}
                actor_stats = {}
                for actor_id in network_manager.ids:
                    episode_rewards, episode_lengths = [], []
                    for i in range(number_of_episodes_per_actor):
                        states, actions, rewards, done, rollout_time = episode_runner.run_episode(
                            sess, actor_id, False)
                        # at the end of episode
                        episode_rewards.append(sum(rewards))
                        episode_lengths.append(len(rewards))
                    actor_scores[actor_id] = compute_actor_score(
                        episode_rewards)
                    actor_stats[actor_id] = (episode_rewards, episode_lengths)
                # update the scores
                network_manager.set_scores(actor_scores)
                # get the statistics of the best actor:
                best_actor_id = network_manager.get_best_scoring_actor_id()
                episode_rewards, episode_lengths = actor_stats[best_actor_id]
                #if (np.argmax(episode_rewards) > 10):
                #    print ("agent won!!")
                summaries_collector.write_test_episode_summaries(
                    sess, global_step, episode_rewards, episode_lengths,
                    best_actor_id)
                # run visualization with the best actor
                visualization_episode_runner.run_episode(
                    sess,
                    best_actor_id,
                    False,
                    render=config['test']['show_best'])

            if update_index % config['general']['save_model_every_cycles'] == 0:
                saver.save(sess,
                           os.path.join(saver_dir, 'all_graph'),
                           global_step=global_step)
    return test_results