Example #1: restores a saved DQN agent (convolutional network, epsilon-decay exploration, replay memory) and runs it deterministically in the "P3DX-v0" Gym environment, resetting when a terminal state is reached.
def main():
    env = OpenAIGym("P3DX-v0")

    agent = DQNAgent(states=dict(type='float', shape=(80, 80, 4)),
                     actions=dict(type='int', num_actions=7),
                     network=[
                         dict(type="conv2d",
                              size=16,
                              window=[8, 8],
                              stride=4,
                              activation="relu"),
                         dict(type="conv2d",
                              size=32,
                              window=[4, 4],
                              stride=2,
                              activation="relu"),
                         dict(type="flatten"),
                         dict(type="dense", size=256)
                     ],
                     actions_exploration=dict(type="epsilon_decay",
                                              initial_epsilon=1.0,
                                              final_epsilon=0.1,
                                              timesteps=1000),
                     memory=dict(type="replay",
                                 capacity=1000,
                                 include_next_states=True),
                     update_mode=dict(unit="timesteps",
                                      batch_size=16,
                                      frequency=4),
                     discount=0.99,
                     entropy_regularization=None,
                     double_q_model=True,
                     optimizer=dict(type="adam", learning_rate=1e-4))

    try:
        agent.restore_model(directory="modelo/", file="data-129235")
        print("Found data!")
    except Exception as e:
        print(e)
        print("Can't load data")

    print("Starting execution")
    state = env.reset()
    agent.reset()
    try:
        while True:
            # Get action - no exploration and no observing
            action = agent.act(state, deterministic=True, independent=True)
            print(action)

            # Execute action in the environment
            state, terminal_state, reward = env.execute(action)

            if terminal_state:
                raise KeyboardInterrupt
    except KeyboardInterrupt:
        print("Terminal state", terminal_state)
        state = env.reset()
        agent.reset()
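
The checkpoint restored at the top of this example would have been written by an earlier training run. A minimal sketch of that saving step, assuming a trained Tensorforce agent configured like the one above (the directory is the same one restore_model() reads from):

# Hedged sketch: write the checkpoint that restore_model() later loads.
checkpoint_path = agent.save_model(directory="modelo/", append_timestep=True)
print("Saved checkpoint:", checkpoint_path)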
Example #2: training loop for a two-player game in which the agent acts on an 80x80 resized frame and observes the reward, the opponent plays a random move, and the game advances several ticks between decisions.
    )

    agent = DQNAgent(**dqn)
    #agent = PPOAgent(**ppo)
    statistics = {}
    actions = [0 for x in range(16)]
    s = time.time()
    skip_steps = 8
    g.flip_player()
    for i in range(100000):
        state = g.reset()

        while not g.is_terminal():
            state = cv2.resize(state, (80, 80))
            # Perform Action
            action = agent.act(state)
            actions[action] += 1
            _, r, t, _ = g.step(action)


            # Add experience, agent automatically updates model according to batch size
            agent.observe(reward=r, terminal=t)
            g.flip_player()
            a2 = random.randint(0, g.get_action_space() - 1)
            _, _, t2, _ = g.step(a2)
            g.flip_player()

            for _ in range(skip_steps):
                g.update()
            # Re-render and fetch the next state so the next act() call sees a fresh frame
            state = g.get_state()
Example #3: test from the Tensorforce introduction blog post that trains a DQNAgent on MinimalTest, first with a Runner and then as an explicit act/observe loop.
    def test_blogpost_introduction_runner(self):
        from tensorforce.environments.minimal_test import MinimalTest
        from tensorforce.agents import DQNAgent
        from tensorforce.execution import Runner

        environment = MinimalTest(specification=[('int', ())])

        network_spec = [
            dict(type='dense', size=32)
        ]

        agent = DQNAgent(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            first_update=100,
            target_sync_frequency=50
        )
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(runner):
            if runner.episode % 100 == 0:
                print(sum(runner.episode_rewards[-100:]) / 100)
            return runner.episode < 100 \
                or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

        # runner.run(episodes=1000, episode_finished=episode_finished)
        runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test

        ### Code block: next
        agent = DQNAgent(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            first_update=100,
            target_sync_frequency=50
        )

        # max_episodes = 1000
        max_episodes = 10  # Only 10 episodes for this test
        max_timesteps = 2000

        episode = 0
        episode_rewards = list()

        while True:
            state = environment.reset()
            agent.reset()

            timestep = 0
            episode_reward = 0
            while True:
                action = agent.act(states=state)
                state, terminal, reward = environment.execute(actions=action)
                agent.observe(terminal=terminal, reward=reward)

                timestep += 1
                episode_reward += reward

                if terminal or timestep == max_timesteps:
                    break

            episode += 1
            episode_rewards.append(episode_reward)

            if all(reward >= 1.0 for reward in episode_rewards[-100:]) or episode == max_episodes:
                break

        agent.close()
        environment.close()
Example #4: evaluation loop for a trading environment (notebook) that steps the agent through each episode, optionally printing the action, position, reward, and running profit, and reports the final cash and mean reward per episode.
elif (num_episodes > 1):
    step_plot = 0
    step_print = 0
    episode_plot = 1
    episode_print = 1
avgreward_per_episode = []
profits_per_episode = []
avg_ep_reward = 0
for i in range(num_episodes):
    agent.reset()
    states = notebook.reset()
    terminal = False
    step_reward = []
    print("Episode: " + str(i))
    while not terminal:
        action = agent.act(states=states)
        position = notebook.position
        states, terminal, reward = notebook.execute(actions=action)
        agent.observe(reward=reward, terminal=terminal)
        if (step_print):
            print("TS: " + str(notebook.time_step) + " action: " +
                  str(action) + " position: " + str(position) + " reward: " +
                  str(reward) + " profit: " + str(notebook.profit) +
                  " curr_price: " + str(notebook.curr_price) + " curr_cash: " +
                  str(notebook.curr_cash))
        step_reward.append(reward)
    final_profit = notebook.curr_cash - notebook.starting_cash
    if (episode_print):
        print(" FinalCash: " + str(notebook.curr_cash) + " Profit: " +
              str(notebook.curr_cash - notebook.starting_cash) +
              " MeanReward: " + str(np.array(step_reward).mean()))
Example #5: runs several agents (PPO, DQN, VPG) on the same state against an infrastructure simulator and records the reward each of them obtains.
        action = agent_ppo.act(state)
        action = action.values()

        #print("ai", action)
        reward = infrastructure.shutdown(action)

        if infrastructure.attempts < infrastructure.peers:
            agent_ppo.observe(reward=reward, terminal=False)
        else:
            agent_ppo.observe(reward=reward, terminal=True)

        rl_ppo.append(reward)

        #dqn agent
        action = agent_dqn.act(state)
        action = action.values()

        reward = infrastructure.monkey(action)

        if infrastructure.attempts < infrastructure.peers:
            agent_dqn.observe(reward=reward, terminal=False)
        else:
            agent_dqn.observe(reward=reward, terminal=True)

        rl_dqn.append(reward)

        #vpg agent
        action = agent_vpg.act(state)
        action = action.values()
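        # Hedged sketch (the excerpt cuts off here): the VPG block presumably finishes
        # like the PPO and DQN blocks above; the infrastructure call and the rl_vpg list
        # are assumptions, not shown in the excerpt.
        reward = infrastructure.monkey(action)  # hypothetical: the actual call is not shown
        if infrastructure.attempts < infrastructure.peers:
            agent_vpg.observe(reward=reward, terminal=False)
        else:
            agent_vpg.observe(reward=reward, terminal=True)
        rl_vpg.append(reward)  # hypothetical list, mirroring rl_ppo and rl_dqn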
Example #6: game loop that renders the game, feeds the captured image state to the agent, executes both players' actions, and advances the simulation a fixed number of ticks.
        counter = 0

        while True:
            counter += 1

            # Render the game state
            g.render()
            g.view()
            g.caption()
            g.gui.capture(save=True)

            # Get and preprocess the state
            s1_image = g.get_state(True)
            s1_image = s1_image.reshape((1, 672, 672, 3))
            action = agent.act({"image": s1_image})

            print(s1_image.shape)
            player1.do_action(action)
            player2.do_action(np.random.randint(0, 16))

            # Process game
            for _ in range(action_frequency):
                g.tick()
                g.update()

            r = -0.01
            t = False

            if g.is_terminal():
                g.reset()
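            # Hedged sketch (not in the excerpt): finish the Tensorforce act/observe cycle
            # by reporting the reward and terminal flag back to the agent.
            agent.observe(reward=r, terminal=t)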
Example #7: Reinforcement, a Tensorforce DQN client for a competitive Tetris game; the state combines heuristic features with a one-hot encoding of the available pieces, and the action is a dictionary of horizontal move, rotation, and piece choice.
class Reinforcement(ClientInterface.ClientInterface):
    def __init__(self,
                 name,
                 load_file=None,
                 is_stats=False,
                 file_stats=None,
                 train_adversary_level=2,
                 nb_batches=5000,
                 nb_games_per_batch=2,
                 layer_size=15,
                 nb_layers=3):
        """
        :param name: name of the AI.
        :param load_file: path and name of the model to load (without any extension).
        :param is_stats: boolean which tells whether the statistics are enabled.
        :param file_stats: name of the file where the statistics are written.
        :param train_adversary_level: integer indicating the AI to train against (corresponds to level in AICreator).
        :param nb_batches: number of batches. A batch is a group of successive games on which the ratio
        (nb_won_games / nb_games_per_batch) is computed and saved in scores.txt.
        :param nb_games_per_batch: number of games per batch.
        :param layer_size: size of a neural network layer.
        :param nb_layers: number of layers in the neural network.
        """

        super().__init__(name, load_file)

        self.current_game_is_finish = None
        self.first_game = True

        # score
        self.score_self_old, self.score_self_new = 0, 0
        self.score_other_old, self.score_other_new = 0, 0
        self.file_scores = open('scores.txt', 'w')

        # AI parameters
        self.heuristics = [
            Heuristic.line_transition, Heuristic.column_transition,
            Heuristic.hidden_empty_cells, Heuristic.wells, Heuristic.holes,
            Heuristic.highest_column, Heuristic.columns_heights
        ]

        state = State.State()
        heuristics_sizes = [
            heuristic(state, state, None) for heuristic in self.heuristics
        ]
        self.nb_heuristics = len(flatten(heuristics_sizes))
        print('nb_heuristics', self.nb_heuristics, 'heuristics_sizes', heuristics_sizes)
        self.train_adversary_level = train_adversary_level

        # iteration
        self.nb_batches = nb_batches
        self.nb_games_per_batch = nb_games_per_batch
        self.iteration = 0

        # neural network
        self.layer_size = layer_size
        self.nb_layers = nb_layers
        network_spec = [
            dict(type='dense', size=self.layer_size, activation='relu')
        ] * self.nb_layers

        self.agent = DQNAgent(states_spec={
            'shape': (self.nb_heuristics + NOMBRE_DE_PIECES, ),
            'type': 'float'
        },
                              actions_spec={
                                  'hor_move': {
                                      'type': 'int',
                                      'num_actions': 11
                                  },
                                  'rotate': {
                                      'type': 'int',
                                      'num_actions': 4
                                  },
                                  'choose': {
                                      'type': 'int',
                                      'num_actions': 3
                                  }
                              },
                              network_spec=network_spec)

        # loading of a saved model
        if load_file is not None:
            self.load(load_file)

        # stats
        self.is_stats = is_stats
        self.my_stats = None
        self.file_stats = file_stats
        self.pid_stats = None

    async def play(self, state):
        """
        Associates an action with a state. Called by the server.
        :param state: dictionary containing information about the game, sent by the server.
        :return: action to apply.
        """

        # update all the scores (self.score_self_new, self.score_self_old, self.score_other_new, self.score_other_old)
        self.update_scores(state)

        # format the state to make it compatible with tensorforce
        state_formatted = self.format_state(state)

        if self.first_game:  # on the first call to play in the first game, no action has
            # been performed yet, so there is nothing to observe
            self.first_game = False
            self.agent.reset()
        else:
            # pass observation to the agent
            terminal = False
            reward = (self.score_self_new - self.score_self_old) - (
                self.score_other_new - self.score_other_old)
            self.agent.observe(terminal, reward)

        # select the action (exploitation or exploration)
        action = self.agent.act(state_formatted)

        # format the action to make it exploitable by the Tetris game
        action_to_apply = self.format_action(action, state)

        return action_to_apply
        # return {"hor_move": -2, "rotate": 1, "choose": state["pieces"][0]}

    def on_init_game(self, data):
        """
        Called at the beginning of a game.
        :param data: dictionary containing information about the game, sent by the server.
        """

        print()
        print(self.iteration)

        self.my_id_in_game = data["ids_in_game"][0]

    def on_finished_game(self, data):
        """
        Called at the end of a game.
        :param data: dictionary containing information about the game, sent by the server.
        """

        self.iteration += 1

        self.current_game_is_finish = True

        # update all the scores
        self.update_scores(data)

        # pass observation to the agent
        terminal = True
        reward = (self.score_self_new - self.score_self_old) - (
            self.score_other_new - self.score_other_old)
        self.agent.observe(terminal, reward)

    def update_scores(self, state):
        """
        Updates the scores of the agent and of the other player.
        :param state: dictionary containing information about the game.
        """

        # update the old scores
        self.score_self_old, self.score_other_old = self.score_self_new, self.score_other_new

        # get the new scores
        self.score_self_new, self.score_other_new = self.format_score(state)

    @staticmethod
    def format_action(action, state):
        """
        Formats the action returned by tensorforce so that it can be used in the play function.
        :param action: action returned by tensorforce (function act).
        :param state: dictionary containing information about the game, sent by the server.
        :return: dictionary containing the action.
        """

        # convert int32 (which is not serializable) to standard int
        action_to_apply = {key: int(value) for key, value in action.items()}

        action_to_apply['hor_move'] -= 5  # [0, 10] -> [-5, 5]
        action_to_apply['choose'] = state['pieces'][
            action_to_apply['choose']]  # index to letter

        return action_to_apply

    def evaluate_heuristics(self, heuristics, g_prec, g_next, action):
        """
        Computes the current values of the heuristics.
        :param heuristics: list containing the heuristic functions.
        :param g_prec: previous state.
        :param g_next: current state.
        :param action: action that leads from g_prec to g_next.
        :return: flat list containing the heuristics values (flattening is necessary because some heuristics are lists).
        """

        return flatten(
            [heuristic(g_prec, g_next, action) for heuristic in heuristics])

    def format_state(self, state):
        """
        Formats the state so that it can be used by tensorforce.
        :param state: dictionary containing information about the game, sent by the server.
        :return: list containing the heuristics values. Represents the state.
        """

        state_bis = State.State(state['grid'])
        heuristics_values = self.evaluate_heuristics(self.heuristics, None,
                                                     state_bis, None)

        # selectable pieces as a one-hot vector
        pieces_one_hot = self.format_pieces(state['pieces'])

        # state used by tensorforce
        state_formatted = heuristics_values + pieces_one_hot

        print('{}, {}'.format(heuristics_values, pieces_one_hot))
        return state_formatted

    def format_pieces(self, pieces):
        """
        Formats the available pieces so that they can be used by tensorforce.
        :param pieces: 3-element list containing letters representing pieces (no repetition).
        :return: 7-element one-hot list containing 1 or 0.
        """

        pieces_formatted = [0] * NOMBRE_DE_PIECES

        for piece in pieces:
            pieces_formatted[self.char_to_int(piece)] = 1

        return pieces_formatted

    def format_score(self, state):
        """
        Extracts the score of the AI and of the other player.
        :param state: dictionary containing information about the game, sent by the server.
        :return: score_self, score_other.
        """

        id_self = self.my_id_in_game
        id_other = (id_self + 1) % 2
        score_self = state['score'][id_self]
        score_other = state['score'][id_other]

        return score_self, score_other

    @staticmethod
    def char_to_int(char):
        """
        Converts a letter whose shape looks like a tetromino to a corresponding integer.
        :param char: 'O', 'I', 'L', 'T', 'S', 'Z' or 'J'.
        :return: integer from 0 to 6.
        """

        lu_table = {'O': 0, 'I': 1, 'L': 2, 'T': 3, 'S': 4, 'Z': 5, 'J': 6}
        return lu_table[char]

    async def train(self):
        """
        Triggers the training.
        """

        await super().init_train()
        if self.is_stats:
            self.my_stats = Stats.Stats()
            self.pid_stats = await self.my_stats.observe()

        for _ in range(self.nb_batches):
            wins = 0
            for _ in range(self.nb_games_per_batch):
                if self.is_stats:
                    await super().new_game(
                        players=[[self.my_client.pid, 1]],
                        ias=[[self.train_adversary_level, 1]],
                        viewers=[0, self.pid_stats])
                else:
                    await super().new_game(
                        players=[[self.my_client.pid, 1]],
                        ias=[[self.train_adversary_level, 1]],
                        viewers=[0])

                self.current_game_is_finish = False

                while not self.current_game_is_finish:
                    await asyncio.sleep(0)

                self.current_game_is_finish = False

                # increment wins when a game is won
                wins += 1 if self.score_self_new > self.score_other_new else 0

            # save the scores in a file
            self.file_scores.write('{}\n'.format(wins /
                                                 self.nb_games_per_batch))
            self.file_scores.flush()

        self.save()

    def save(self):
        """
        Saves the current model in directory rein_learn_models as 3 files.
        """

        #TODO: report whether loading succeeded
        # directory = os.path.join(os.getcwd(), 'rein_learn_models')
        time_str = time.strftime('%Y%m%d_%H%M%S')
        directory = os.path.join(os.getcwd(), 'rein_learn_models',
                                 'agent_' + time_str)
        checkpoint = self.agent.save_model(directory=directory,
                                           append_timestep=True)
        print('directory: {}'.format(directory))
        print('checkpoint: {}'.format(checkpoint))

    def load(self, load_file):
        """
        Loads a saved model.
        :param load_file: path and name of the model to load (without any extension).
        """

        # load_file represents the file path (without any extension)
        directory = os.path.dirname(load_file)
        file = os.path.basename(load_file)

        self.agent.restore_model(directory=directory, file=file)
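
A minimal usage sketch for this client, assuming the game server and the ClientInterface framework it builds on are available; the name and parameter values below are illustrative only:

# Hypothetical usage: create the client and launch training against a level-2 AI.
import asyncio

client = Reinforcement(name='dqn_tetris', train_adversary_level=2,
                       nb_batches=10, nb_games_per_batch=2)
asyncio.get_event_loop().run_until_complete(client.train())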
Example #8: the introduction blog-post test written against the older Tensorforce Configuration API, first with a Runner and then as an explicit act/observe loop.
    def test_blogpost_introduction_runner(self):
        from tensorforce.config import Configuration
        from tensorforce.core.networks import layered_network_builder
        from tensorforce.environments.minimal_test import MinimalTest
        from tensorforce.agents import DQNAgent
        from tensorforce.execution import Runner

        environment = MinimalTest(definition=False)

        network_config = [
            dict(type='dense', size=32)
        ]
        agent_config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder(network_config)
        )

        agent = DQNAgent(config=agent_config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(runner):
            if runner.episode % 100 == 0:
                print(sum(runner.episode_rewards[-100:]) / 100)
            return runner.episode < 100 \
                   or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

        # runner.run(episodes=1000, episode_finished=episode_finished)
        runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test

        ### Code block: next

        # max_episodes = 1000
        max_episodes = 10  # Only 10 episodes for this test
        max_timesteps = 2000

        episode = 0
        episode_rewards = list()

        while True:
            state = environment.reset()
            agent.reset()

            timestep = 0
            episode_reward = 0
            while True:
                action = agent.act(state=state)
                state, reward, terminal = environment.execute(action=action)
                agent.observe(reward=reward, terminal=terminal)

                timestep += 1
                episode_reward += reward

                if terminal or timestep == max_timesteps:
                    break

            episode += 1
            episode_rewards.append(episode_reward)

            if all(reward >= 1.0 for reward in episode_rewards[-100:]) \
                    or episode == max_episodes:
                break
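
        # Hedged sketch (not in the excerpt): release the agent and environment once the
        # loop ends, assuming this agent exposes close() as in the newer example above.
        agent.close()
        environment.close()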