def evolve(self, population, save=True):
        """
        Evolve agents

        :param population:
        :type population:
        :param save: save agents weights and scores
        :type save: bool
        :return:
        """

        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
        print("Optimization - started", timestamp)

        agents = population.create_population()

        for i in range(population.max_generations):
            if not self.terminate(population, i):
                try:
                    population.agents_weights[i] = np.array(
                        [a.model.get_weights() for a in agents],
                        dtype=np.ndarray)

                    for j, agent in enumerate(agents):  # TODO parallelize
                        score = agent.run_agent()
                        population.scores[i][j] = score

                    print_scores(i + 1, population.scores[i])

                    if save and (i + 1) % 50 == 0:
                        # periodic checkpoint, including the generation just evaluated
                        save_results(population.agents_weights[:i + 1],
                                     population.scores[:i + 1], timestamp)

                    if i < population.max_generations - 1:
                        self.generate_next_generation(population=population,
                                                      generation=i)

                        for k, a in enumerate(agents):
                            agents[k].model.set_weights(
                                population.agents_weights[i + 1][k])

                except KeyboardInterrupt:
                    LOGGER.log(environment=ENVIRONMENT.name,
                               timestamp=timestamp,
                               algorithm=self.__class__.__name__,
                               parameters=vars(self),
                               generations=i,
                               score=np.max(population.scores[i - 1]))
                    # save everything up to the last fully evaluated generation
                    save_results(population.agents_weights[:i],
                                 population.scores[:i], timestamp)
                    sys.exit()

            else:
                population.agents_weights = population.agents_weights[:i]
                population.scores = population.scores[:i]
                break

        if save:
            LOGGER.log(environment=ENVIRONMENT.name,
                       timestamp=timestamp,
                       algorithm=self.__class__.__name__,
                       parameters=vars(self),
                       generations=i,
                       score=np.max(population.scores[i]))
            save_results(population.agents_weights, population.scores,
                         timestamp)

        return population.agents_weights, population.scores
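# A hedged, self-contained sketch (not this project's generate_next_generation):
# one evaluate -> select -> mutate step of the kind a loop like evolve() drives
# each generation. The names and hyper-parameters (elite_frac, sigma) are
# assumptions for illustration only.
import numpy as np

def next_generation(weights, scores, elite_frac=0.2, sigma=0.02, rng=None):
    """Keep the best-scoring agents and fill the rest with mutated copies."""
    rng = rng or np.random.default_rng()
    n, n_elite = len(weights), max(1, int(len(weights) * elite_frac))
    elites = weights[np.argsort(scores)[-n_elite:]]                # highest scores survive
    children = elites[rng.integers(0, n_elite, n - n_elite)]       # clone elites at random
    children = children + rng.normal(0.0, sigma, children.shape)   # Gaussian mutation
    return np.vstack([elites, children])

# usage: 10 agents with 4 flat parameters each, fitness = parameter sum
pop = np.random.default_rng(0).normal(size=(10, 4))
new_pop = next_generation(pop, pop.sum(axis=1))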
Example #2
    def run(self, train=True):
        """
        Run DDPG agent

        :param train: if True, update the actor and critic networks while running
        :type train: bool
        :return:
        """

        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

        step = 0

        self.load_weights()

        try:

            for i in range(self.n_episodes):

                # get initial state
                state = self.environment.env.reset()

                total_reward = 0

                for j in range(self.environment.max_time):
                    # loss = 0

                    self.environment.env.render()

                    action = self.actor.model.predict(
                        state.reshape(1, state.shape[0]))
                    action = self.noise.get_noisy_action(action, j)

                    new_state, reward, done, info = self.environment.env.step(
                        action[0])
                    # td_error = reward + self.gamma * 1 - 1

                    self.buffer.add(state, action[0], reward, new_state, done)

                    s, a, r, new_s, d = self.buffer.get_batch(self.batch_size)
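                    # sample a mini-batch and bootstrap the critic target from the
                    # target networks: Q'(s', mu'(s')) (standard DDPG)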

                    target_q = self.critic.target_model.predict(
                        [new_s, self.actor.target_model.predict(new_s)])

                    y = r
                    for k in range(len(d)):
                        # do not bootstrap past a terminal state
                        if d[k]:
                            y[k] = r[k]
                        else:
                            y[k] = r[k] + self.gamma * target_q[k]

                    if train:
                        # loss += self.critic.model.train_on_batch([s, a], y)
                        self.critic.model.train_on_batch([s, a], y)
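                        # deterministic policy gradient: feed the actor's actions to the
                        # critic and update the actor along dQ/da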
                        a_grads = self.actor.model.predict(s)
                        grads = self.critic.gradients(s, a_grads)
                        self.actor.train(s, grads)

                        self.actor.update_target()
                        self.critic.update_target()

                    total_reward += reward

                    if (i + 1) % 200 == 0:
                        LOGGER.log(environment=self.environment.name,
                                   timestamp=timestamp,
                                   algorithm=self.__class__.__name__,
                                   parameters=self.get_params(),
                                   total_steps=i,
                                   score=total_reward)
                        self.save_weights()

                    if done:  # or np.array_equal(np.around(new_state, 3), np.around(state, 3)):
                        previous_reward = total_reward
                        break

                    state = new_state

                    # print("Episode", i, "Step", step, "Action", action, "Reward", reward, "Loss", loss)

                    step += 1

                print(
                    "Episode: {:<5d}  Total Reward: {:<+10.3f}  Total Steps: {:<10d} "
                    " Replay Buffer size: {}".format(
                        i, total_reward, step, self.buffer.n_experiences))

        except KeyboardInterrupt:
            print("Training interrupted.")

        print("Saving weights...")
        self.save_weights()
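# A hedged, standalone illustration (plain numpy, not this class's Keras models):
# the DDPG bootstrap target computed above, with the usual terminal-state mask
# y = r + gamma * Q'(s', mu'(s')) * (1 - done). Values are made up for the demo.
import numpy as np

gamma = 0.99
r = np.array([1.0, 0.5, -1.0, 0.2])
target_q = np.array([10.0, 8.0, 3.0, 7.0])   # stand-in for critic.target_model.predict(...)
done = np.array([0.0, 0.0, 1.0, 0.0])        # the third transition ended its episode

y = r + gamma * target_q * (1.0 - done)      # no bootstrapping past a terminal state
print(y)                                     # -> 10.9, 8.42, -1.0, 7.13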
Example #3
    def train(self):
        """
        Train a TD3 agent on BipedalWalker-v2

        :return:
        """

        # get start timestamp
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

        # saving paths
        model_path = os.path.join(td3_cfg.models_path,
                                  '{}-{}'.format(env_cfg.name,
                                                 timestamp), 'model.ckpt')
        results_path = os.path.join(td3_cfg.models_path,
                                    '{}-{}'.format(env_cfg.name,
                                                   timestamp), 'results.npy')
        distances_path = os.path.join(td3_cfg.models_path,
                                      '{}-{}'.format(env_cfg.name, timestamp),
                                      'distances.npy')
        weights_path = os.path.join(td3_cfg.models_path,
                                    '{}-{}'.format(env_cfg.name, timestamp),
                                    'weights_init_fin.npy')
        video_dir = os.path.join(td3_cfg.models_path,
                                 '{}-{}'.format(env_cfg.name,
                                                timestamp), 'video')

        # read environment information from config
        env = env_cfg.env

        # record a video every `record_videos` episodes (when enabled)
        if td3_cfg.record_videos:
            env = wrappers.Monitor(
                env,
                video_dir,
                video_callable=lambda ep: ep % td3_cfg.record_videos == 0)

        # arrays with rewards, distances for later optimality analysis
        rewards = []
        distances_consecutive = np.zeros(2, dtype=np.ndarray)
        distances_init = np.zeros(2, dtype=np.ndarray)

        with tf.Session() as sess:

            # initialization

            self.agent = TD3(sess)
            saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            sess.run(init)

            self.agent.initialize()
            global_step = 0
            count, eval_ep_reward = 0, 0.0  # defined up front so final logging works even on early interrupt

            # track initial and previous actor weights for per-episode distance analysis
            weights_init = get_actor_weights(sess)
            weights_old = weights_init
            weights = weights_init  # fallback if interrupted before the first update

            try:

                for i in range(td3_cfg.n_episodes):

                    s = env.reset()  # get initial state
                    ep_reward = 0
                    ep_steps = 0
                    noises = []
                    actions = []
                    done = False

                    while not done:

                        env.render()
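                        # random actions for the first few steps of each episode
                        # (exploration), then the noisy deterministic policy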

                        if ep_steps < 10:
                            action = self.agent.get_random_action()
                        else:
                            action, action_org, noise = self.agent.get_noisy_action(
                                s)
                            noises.append(noise)
                            actions.append(action_org)
                        action = action.squeeze()

                        s2, r, done, info = env.step(action.tolist())
                        ep_reward += r
                        ep_steps += 1
                        global_step += 1

                        # store transition in replay buffer
                        self.agent.store_experience(s, action, r, done, s2)

                        # use symmetry of leg 1 and leg 2
                        mirrored_s = mirror_state(s)
                        mirrored_s2 = mirror_state(s2)
                        mirrored_a = mirror_action(action)
                        self.agent.store_experience(mirrored_s, mirrored_a, r,
                                                    done, mirrored_s2)

                        # train agent
                        temp = self.agent.train(global_step)
                        if temp:
                            weights = temp

                        s = s2

                        if done:
                            # end of the episode
                            count = i + 1

                            # get trained weights
                            weights = get_actor_weights(sess)
                            for iw, w in enumerate(weights):
                                # compute the distances with the previous weights and the initial weights
                                con, init = compute_distance_episodes(
                                    weights_init[iw], weights_old[iw],
                                    weights[iw])
                                distances_consecutive[iw] = np.append(
                                    distances_consecutive[iw], con)
                                distances_init[iw] = np.append(
                                    distances_init[iw], init)
                            weights_old = weights

                            # evaluation
                            if count % td3_cfg.test_every == 0:
                                eval_ep_reward, eval_ep_steps = self.evaluate(
                                    env)
                                print(
                                    "Episode: {:<10d} Evaluation Reward: {:<+10.3f}  "
                                    "Total Training Steps: {:10d}".format(
                                        count, eval_ep_reward, global_step))
                                rewards.append(eval_ep_reward)

                            # saving
                            if count % td3_cfg.save_every == 0:
                                saver.save(sess, model_path, global_step=count)
                                np.save(results_path, rewards)
                                np.save(
                                    distances_path,
                                    np.vstack((distances_consecutive,
                                               distances_init)))
                                np.save(weights_path,
                                        np.append(weights_init, weights))
            except KeyboardInterrupt:
                print("Training interrupted.")

            # Finalize training and save results
            print('Total steps:', global_step)
            print("Saving results...")
            LOGGER.log(environment=env_cfg.name,
                       timestamp=timestamp,
                       algorithm=self.agent.__class__.__name__,
                       parameters=vars(td3_cfg),
                       total_steps=global_step,
                       score=eval_ep_reward)
            saver.save(sess, model_path, global_step=count)
            np.save(results_path, rewards)
            np.save(distances_path,
                    np.vstack((distances_consecutive, distances_init)))
            np.save(weights_path, np.append(weights_init, weights))

        env.close()
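# A hedged, standalone sketch (not the TD3 class used above): the clipped
# double-Q target with target-policy smoothing that defines TD3, in plain
# numpy. q1_t / q2_t stand in for the two target critics evaluated at
# (s2, a_smoothed); all numbers are illustrative.
import numpy as np

gamma, noise_std, noise_clip = 0.99, 0.2, 0.5
rng = np.random.default_rng(0)

r = np.array([1.0, -0.5])
done = np.array([0.0, 1.0])
a_target = np.array([[0.3, -0.1], [0.8, 0.4]])   # target actor's action mu'(s2)

# target-policy smoothing: add clipped Gaussian noise to the target action
noise = np.clip(rng.normal(0.0, noise_std, a_target.shape), -noise_clip, noise_clip)
a_smoothed = np.clip(a_target + noise, -1.0, 1.0)

q1_t = np.array([5.0, 2.0])                      # target critic 1 at (s2, a_smoothed)
q2_t = np.array([4.5, 2.5])                      # target critic 2 at (s2, a_smoothed)
y = r + gamma * np.minimum(q1_t, q2_t) * (1.0 - done)   # clipped double-Q, masked at terminals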