Example #1
# Imports assumed by this snippet (Tensorforce 0.x API); `env` is expected to
# be a Tensorforce Environment instance created elsewhere in the script.
from datetime import datetime

from tensorforce.agents import DQNAgent
from tensorforce.execution import Runner


def main(max_timesteps, learning_rate):
    max_episodes = None
    #max_timesteps = 86400000000*days

    network_spec = [
        #dict(type='flatten'),
        dict(type='dense', size=11, activation='tanh'),
        #dict(type='dense', size=20, activation='tanh'),
        #dict(type='dense', size=32, activation='tanh'),
    ]

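    # epsilon-greedy exploration, with epsilon decayed over the full training horizon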
    exploration = dict(type='epsilon_decay', timesteps=max_timesteps)

    summarizer = dict(
        directory="./models/" + str(datetime.now()).replace(' ', ''),
        steps=10000,
        seconds=None,
        labels=[
            #'rewards',
            #'actions',
            'inputs',
            'gradients',
            'configuration',
        ],
        meta_dict=dict(
            description='July 2: Trying 11 node hidden layer.',
            layers=str(network_spec),
            timesteps=max_timesteps,
            exploration=exploration,
        ),
    )

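    # DQN agent over the environment's state/action spaces, trained with Adam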
    agent = DQNAgent(states=env.states,
                     actions=env.actions,
                     network=network_spec,
                     actions_exploration=exploration,
                     optimizer=dict(type='adam', learning_rate=learning_rate)
                     #summarizer=summarizer,
                     #batch_size=64
                     )

    runner = Runner(agent, env)

    report_episodes = 1

    # track the timestep count of the previous episode across callbacks
    global prev
    prev = 0

    def episode_finished(r):
        global prev
        if r.episode % report_episodes == 0:
            #print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep-prev))
            #print("Episode reward: {}".format(r.episode_rewards[-1]))
            print(r.episode_rewards[-1])
        prev = r.timestep
        #print("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent,
                                                            env=env))

    runner.run(num_episodes=max_episodes,
               num_timesteps=max_timesteps,
               max_episode_timesteps=None,
               episode_finished=episode_finished)

    agent.save_model(directory='./results/DeepQ/' +
                     str(datetime.now()).replace(' ', '') + '/model')

    runner.close()

    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
Example #2
# Standard-library and Tensorforce imports assumed by this snippet; the
# project-specific modules (ClientInterface, Heuristic, State, Stats, the
# flatten helper and the NOMBRE_DE_PIECES constant) are imported elsewhere in
# the original file and are assumed to be available here.
import asyncio
import os
import time

from tensorforce.agents import DQNAgent


class Reinforcement(ClientInterface.ClientInterface):
    def __init__(self,
                 name,
                 load_file=None,
                 is_stats=False,
                 file_stats=None,
                 train_adversary_level=2,
                 nb_batches=5000,
                 nb_games_per_batch=2,
                 layer_size=15,
                 nb_layers=3):
        """
        :param name: name of the AI.
        :param load_file: path and name of the model to load (without any extension).
        :param is_stats: boolean which tells whether the statistics are enabled.
        :param file_stats: name of the file where the statistics are written.
        :param train_adversary_level: integer indicating the AI to train against (corresponds to level in AICreator).
        :param nb_batches: number of batches. A batch is a group of successive games over which the ratio
        (nb_won_games / nb_games_per_batch) is computed and saved in scores.txt.
        :param nb_games_per_batch: number of games per batch.
        :param layer_size: size of a neural network layer.
        :param nb_layers: number of layers in the neural network.
        """

        super().__init__(name, load_file)

        self.current_game_is_finish = None
        self.first_game = True
        self.my_id_in_game = None  # set in on_init_game

        # score
        self.score_self_old, self.score_self_new = 0, 0
        self.score_other_old, self.score_other_new = 0, 0
        self.file_scores = open('scores.txt', 'w')

        # AI parameters
        self.heuristics = [
            Heuristic.line_transition, Heuristic.column_transition,
            Heuristic.hidden_empty_cells, Heuristic.wells, Heuristic.holes,
            Heuristic.highest_column, Heuristic.columns_heights
        ]

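        # Evaluate each heuristic once on an empty state to determine the
        # flattened length of the feature vector fed to the network.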
        state = State.State()
        heuristics_sizes = [
            heuristic(state, state, None) for heuristic in self.heuristics
        ]
        self.nb_heuristics = len(flatten(heuristics_sizes))
        print('self.nb_heuristics:', self.nb_heuristics)
        self.train_adversary_level = train_adversary_level

        # iteration
        self.nb_batches = nb_batches
        self.nb_games_per_batch = nb_games_per_batch
        self.iteration = 0

        # neural network
        self.layer_size = layer_size
        self.nb_layers = nb_layers
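        # nb_layers identical fully-connected ReLU layers of width layer_size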
        network_spec = [
            dict(type='dense', size=self.layer_size, activation='relu')
        ] * self.nb_layers

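        # State: heuristic features plus a binary encoding of the available
        # pieces. Actions: horizontal move (11 values), rotation (4) and piece
        # choice (3), expressed as a factored (dict) action space.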
        self.agent = DQNAgent(states_spec={
            'shape': (self.nb_heuristics + NOMBRE_DE_PIECES, ),
            'type': 'float'
        },
                              actions_spec={
                                  'hor_move': {
                                      'type': 'int',
                                      'num_actions': 11
                                  },
                                  'rotate': {
                                      'type': 'int',
                                      'num_actions': 4
                                  },
                                  'choose': {
                                      'type': 'int',
                                      'num_actions': 3
                                  }
                              },
                              network_spec=network_spec)

        # loading of a saved model
        if load_file is not None:
            self.load(load_file)

        # stats
        self.is_stats = is_stats
        self.my_stats = None
        self.file_stats = file_stats
        self.pid_stats = None

    async def play(self, state):
        """
        Associates an action to a state. Called by the server.
        :param state: dictionary containing information about the game, sent by the server.
        :return: action to apply.
        """

        # update all the scores (self.score_self_new, self.score_self_old, self.score_other_new, self.score_other_old)
        self.update_scores(state)

        # format the state to make it compatible with tensorforce
        state_formatted = self.format_state(state)

        if self.first_game:
            # At the first call to play in the first game, no action has been
            # performed yet, so there is nothing to observe.
            self.first_game = False
            self.agent.reset()
        else:
            # pass observation to the agent
            terminal = False
            reward = (self.score_self_new - self.score_self_old) - (
                self.score_other_new - self.score_other_old)
            self.agent.observe(terminal, reward)

        # select the action (exploitation or exploration)
        action = self.agent.act(state_formatted)

        # format the action to make it exploitable by the Tetris game
        action_to_apply = self.format_action(action, state)

        return action_to_apply
        # return {"hor_move": -2, "rotate": 1, "choose": state["pieces"][0]}

    def on_init_game(self, data):
        """
        Called at the beginning of a game.
        :param data: dictionary containing information about the game, sent by the server.
        """

        print()
        print(self.iteration)

        self.my_id_in_game = data["ids_in_game"][0]

    def on_finished_game(self, data):
        """
        Called at the end of a game.
        :param data: dictionary containing information about the game, sent by the server.
        """

        self.iteration += 1

        self.current_game_is_finish = True

        # update all the scores
        self.update_scores(data)

        # pass observation to the agent
        terminal = True
        reward = (self.score_self_new - self.score_self_old) - (
            self.score_other_new - self.score_other_old)
        self.agent.observe(terminal, reward)

    def update_scores(self, state):
        """
        Updates the scores of the agent and of the other player.
        :param state: dictionary containing information about the game.
        """

        # update the old scores
        self.score_self_old, self.score_other_old = self.score_self_new, self.score_other_new

        # get the new scores
        self.score_self_new, self.score_other_new = self.format_score(state)

    @staticmethod
    def format_action(action, state):
        """
        Formats the action returned by tensorforce so that it can be used in the play function.
        :param action: action returned by tensorforce (function act).
        :param state: dictionary containing information about the game, sent by the server.
        :return: dictionary containing the action.
        """

        # convert int32 (which is not serializable) to standard int
        action_to_apply = {key: int(value) for key, value in action.items()}

        action_to_apply['hor_move'] -= 5  # [0, 10] -> [-5, 5]
        # convert the chosen piece index to the corresponding letter
        action_to_apply['choose'] = state['pieces'][action_to_apply['choose']]

        return action_to_apply

    def evaluate_heuristics(self, heuristics, g_prec, g_next, action):
        """
        Computes the current values of the heuristics.
        :param heuristics: list containing the heuristic functions.
        :param g_prec: previous state.
        :param g_next: current state.
        :param action: action which allows to go from g_prec to g_next.
        :return: flat list containing the heuristics values (flattening is necessary because some heuristics are lists).
        """

        return flatten(
            [heuristic(g_prec, g_next, action) for heuristic in heuristics])

    def format_state(self, state):
        """
        Formats the state so that it can be used by tensorforce.
        :param state: dictionary containing information about the game, sent by the server.
        :return: list containing the heuristics values. Represents the state.
        """

        state_bis = State.State(state['grid'])
        heuristics_values = self.evaluate_heuristics(self.heuristics, None,
                                                     state_bis, None)

        # selectable pieces as a one-hot vector
        pieces_one_hot = self.format_pieces(state['pieces'])

        # state used by tensorforce
        state_formatted = heuristics_values + pieces_one_hot

        print('{}, {}'.format(heuristics_values, pieces_one_hot))
        return state_formatted

    def format_pieces(self, pieces):
        """
        Formats the available pieces so that they can be used by tensorforce.
        :param pieces: 3-element list containing letters representing pieces (no repetition).
        :return: 7-element binary list with a 1 for each available piece.
        """

        pieces_formatted = [0] * NOMBRE_DE_PIECES

        for piece in pieces:
            pieces_formatted[self.char_to_int(piece)] = 1

        return pieces_formatted

    def format_score(self, state):
        """
        Extracts the score of the AI and of the other player.
        :param state: dictionary containing information about the game, sent by the server.
        :return: score_self, score_other.
        """

        id_self = self.my_id_in_game
        id_other = (id_self + 1) % 2
        score_self = state['score'][id_self]
        score_other = state['score'][id_other]

        return score_self, score_other

    @staticmethod
    def char_to_int(char):
        """
        Converts a letter whose shape looks like a tetromino to a corresponding integer.
        :param char: 'O', 'I', 'L', 'T', 'S', 'Z' or 'J'.
        :return: integer from 0 to 6.
        """

        lu_table = {'O': 0, 'I': 1, 'L': 2, 'T': 3, 'S': 4, 'Z': 5, 'J': 6}
        return lu_table[char]

    async def train(self):
        """
        Triggers the training.
        """

        await super().init_train()
        if self.is_stats:
            self.my_stats = Stats.Stats()
            self.pid_stats = await self.my_stats.observe()

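        # Each batch plays nb_games_per_batch games against the chosen
        # adversary and logs the resulting win ratio to scores.txt.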
        for _ in range(self.nb_batches):
            wins = 0
            for _ in range(self.nb_games_per_batch):
                if self.is_stats:
                    await super().new_game(
                        players=[[self.my_client.pid, 1]],
                        ias=[[self.train_adversary_level, 1]],
                        viewers=[0, self.pid_stats])
                else:
                    await super().new_game(
                        players=[[self.my_client.pid, 1]],
                        ias=[[self.train_adversary_level, 1]],
                        viewers=[0])

                self.current_game_is_finish = False

                while not self.current_game_is_finish:
                    await asyncio.sleep(0)

                self.current_game_is_finish = False

                # increment wins when a game is won
                wins += 1 if self.score_self_new > self.score_other_new else 0

            # save the scores in a file
            self.file_scores.write('{}\n'.format(wins /
                                                 self.nb_games_per_batch))
            self.file_scores.flush()

        self.save()

    def save(self):
        """
        Saves the current model in directory rein_learn_models as 3 files.
        """

        # TODO: report whether the model was loaded correctly
        # directory = os.path.join(os.getcwd(), 'rein_learn_models')
        time_str = time.strftime('%Y%m%d_%H%M%S')
        directory = os.path.join(os.getcwd(), 'rein_learn_models',
                                 'agent_' + time_str)
        checkpoint = self.agent.save_model(directory=directory,
                                           append_timestep=True)
        print('directory: {}'.format(directory))
        print('checkpoint: {}'.format(checkpoint))

    def load(self, load_file):
        """
        Loads a saved model.
        :param load_file: path and name of the model to load (without any extension).
        """

        # load_file represents the file path (without any extension)
        directory = os.path.dirname(load_file)
        file = os.path.basename(load_file)

        self.agent.restore_model(directory=directory, file=file)
Example #3
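# Fragment from the tail of a training loop: `episode`, `timestep`, `success`,
# `episode_reward`, `episode_rewards`, `successes`, `record_dir`, `maze_id`,
# `agent` and `GazeboMaze` are assumed to be defined earlier in that loop.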
    episode += 1
    total_timestep += timestep
    # avg_reward = float(episode_reward)/timestep
    successes.append(success)
    episode_rewards.append([episode_reward, timestep, success])

    # if total_timestep > 100000:
    #     print('{}th episode reward: {}'.format(episode, episode_reward))

    if episode % 100 == 0:
        with open(record_dir + '/E2E_DQN_nav' + str(maze_id) + '.txt', 'a+') as f:
            for i in episode_rewards:
                f.write(str(i))
                f.write('\n')
        episode_rewards = []
        agent.save_model('./models/')

    if len(successes) > 100:
        if sum(successes[-100:]) > 80:
            GazeboMaze.close()
            agent.save_model('./models/')
            with open(record_dir + '/DQN_nav' + str(maze_id) + '.txt', 'a+') as f:
                for i in episode_rewards:
                    f.write(str(i))
                    f.write('\n')
            print("Training End!")
            break
Example #4
# Imports assumed by this snippet; `agent`, `runner`, `env` and `candles` are
# expected to be created earlier in the original script.
import matplotlib.pyplot as plt
import numpy as np

t = list()
rew = list()


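# Callback invoked by the Runner after each episode: logs the result, updates
# the live reward plot and checkpoints the agent.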
def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
    plt.plot(r.episode_rewards, 'r+')
    plt.pause(0.01)
    agent.save_model('forex_DQNAgent/')
    return True


# Start learning
runner.run(episodes=7000,
           max_episode_timesteps=(candles.candle_nums + 100),
           episode_finished=episode_finished)

agent.save_model('forex_agent_sma/')

# Print statistics
print(
    "Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}."
    .format(ep=runner.episode, ar=np.mean(runner.episode_rewards[-100:])))

print(env.pair_currency)
print(env.base_currency)

runner.close()