def test_blogpost_introduction_runner(self):
    from tensorforce.environments.minimal_test import MinimalTest
    from tensorforce.agents import DQNAgent
    from tensorforce.execution import Runner

    environment = MinimalTest(specification=[('int', ())])

    network_spec = [
        dict(type='dense', size=32)
    ]
    agent = DQNAgent(
        states_spec=environment.states,
        actions_spec=environment.actions,
        network_spec=network_spec,
        memory=dict(
            type='replay',
            capacity=1000
        ),
        batch_size=8,
        first_update=100,
        target_sync_frequency=50
    )
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(runner):
        if runner.episode % 100 == 0:
            print(sum(runner.episode_rewards[-100:]) / 100)
        return runner.episode < 100 \
            or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

    # runner.run(episodes=1000, episode_finished=episode_finished)
    runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test

    ### Code block: next

    agent = DQNAgent(
        states_spec=environment.states,
        actions_spec=environment.actions,
        network_spec=network_spec,
        memory=dict(
            type='replay',
            capacity=1000
        ),
        batch_size=8,
        first_update=100,
        target_sync_frequency=50
    )

    # max_episodes = 1000
    max_episodes = 10  # Only 10 episodes for this test
    max_timesteps = 2000

    episode = 0
    episode_rewards = list()

    while True:
        state = environment.reset()
        agent.reset()
        timestep = 0
        episode_reward = 0

        while True:
            action = agent.act(states=state)
            state, terminal, reward = environment.execute(actions=action)
            agent.observe(terminal=terminal, reward=reward)

            timestep += 1
            episode_reward += reward

            if terminal or timestep == max_timesteps:
                break

        episode += 1
        episode_rewards.append(episode_reward)

        if all(reward >= 1.0 for reward in episode_rewards[-100:]) or episode == max_episodes:
            break

    agent.close()
    environment.close()
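# A minimal sketch of persisting the trained agent after either code block above,
# using the same save_model/restore_model API that the Reinforcement class further
# below relies on; the directory name is hypothetical:
import os

save_dir = os.path.join(os.getcwd(), 'blogpost_models')  # hypothetical path
checkpoint = agent.save_model(directory=save_dir, append_timestep=True)
print('checkpoint: {}'.format(checkpoint))
# A compatible agent (same states/actions/network specs) can later be restored with:
# agent.restore_model(directory=save_dir)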
import random
import time

import cv2

s = time.time()
skip_steps = 8
g.flip_player()
for i in range(100000):
    state = g.reset()
    while not g.is_terminal():
        state = cv2.resize(state, (80, 80))

        # Perform action
        action = agent.act(state)
        actions[action] += 1
        _, r, t, _ = g.step(action)

        # Add experience; the agent automatically updates its model according to the batch size
        agent.observe(reward=r, terminal=t)

        # Let the opponent take a random action
        g.flip_player()
        a2 = random.randint(0, g.get_action_space() - 1)
        _, _, t2, _ = g.step(a2)
        g.flip_player()

        for _ in range(skip_steps):
            g.update()

        # Re-render and get the state
        s1 = g.get_state()

        if t is True or t2 is True:
            # Record which player won the game
            if g.winner.id not in statistics:
                statistics[g.winner.id] = 1
            else:
                statistics[g.winner.id] += 1
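# A short follow-up sketch: summarise the win statistics gathered above,
# assuming statistics maps each player id to the number of games that player won:
total_games = sum(statistics.values())
for player_id, wins in statistics.items():
    print('player {}: {} wins ({:.1f}%)'.format(player_id, wins, 100.0 * wins / total_games))
print('elapsed: {:.1f}s'.format(time.time() - s))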
episode_plot = 1
episode_print = 1
avgreward_per_episode = []
profits_per_episode = []
avg_ep_reward = 0

for i in range(num_episodes):
    agent.reset()
    states = notebook.reset()
    terminal = False
    step_reward = []
    print("Episode: " + str(i))

    while not terminal:
        action = agent.act(states=states)
        position = notebook.position
        states, terminal, reward = notebook.execute(actions=action)
        agent.observe(reward=reward, terminal=terminal)
        if step_print:
            print("TS: " + str(notebook.time_step) +
                  " action: " + str(action) +
                  " position: " + str(position) +
                  " reward: " + str(reward) +
                  " profit: " + str(notebook.profit) +
                  " curr_price: " + str(notebook.curr_price) +
                  " curr_cash: " + str(notebook.curr_cash))
        step_reward.append(reward)

    final_profit = notebook.curr_cash - notebook.starting_cash
    if episode_print:
        print(" FinalCash: " + str(notebook.curr_cash) +
              " Profit: " + str(final_profit) +
              " MeanReward: " + str(np.array(step_reward).mean()))
    profits_per_episode.append(final_profit)

    if step_plot:
        plt.figure(figsize=(20, 12))  # (per-step plotting code not shown)
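# The episode_plot flag above suggests an episode-level summary plot; a minimal
# sketch, assuming matplotlib.pyplot is imported as plt (as the step_plot branch implies):
plt.figure(figsize=(20, 12))
plt.plot(profits_per_episode)
plt.xlabel('Episode')
plt.ylabel('Profit')
plt.title('Profit per episode')
plt.show()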
if infrastructure.attempts < infrastructure.peers:
    agent_ppo.observe(reward=reward, terminal=False)
else:
    agent_ppo.observe(reward=reward, terminal=True)
rl_ppo.append(reward)

# DQN agent
action = agent_dqn.act(state)
action = action.values()
reward = infrastructure.monkey(action)
if infrastructure.attempts < infrastructure.peers:
    agent_dqn.observe(reward=reward, terminal=False)
else:
    agent_dqn.observe(reward=reward, terminal=True)
rl_dqn.append(reward)

# VPG agent
action = agent_vpg.act(state)
action = action.values()
reward = infrastructure.monkey(action)
if infrastructure.attempts < infrastructure.peers:
    agent_vpg.observe(reward=reward, terminal=False)
else:
    agent_vpg.observe(reward=reward, terminal=True)
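# A hedged sketch of how the three agents compared above might be built;
# states_spec, actions_spec and network_spec are assumptions, since the
# excerpt does not show the agents' construction:
from tensorforce.agents import DQNAgent, PPOAgent, VPGAgent

agent_ppo = PPOAgent(states_spec=states_spec, actions_spec=actions_spec,
                     network_spec=network_spec, batch_size=8)
agent_dqn = DQNAgent(states_spec=states_spec, actions_spec=actions_spec,
                     network_spec=network_spec, batch_size=8)
agent_vpg = VPGAgent(states_spec=states_spec, actions_spec=actions_spec,
                     network_spec=network_spec, batch_size=8)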
# (excerpt from the per-episode loop; the episode loop header and variable
# initialisation are not shown in the original)
while True:  # per-timestep loop, implied by the break below
    state = dict()
    state['image'] = observation
    state['previous_act'] = GazeboMaze.vel_cmd
    state['relative_pos'] = GazeboMaze.p
    # state = dict(image=observation, previous_act=GazeboMaze.vel_cmd, relative_pos=GazeboMaze.p)
    # print(state)

    # Query the agent for its action decision
    action = agent.act(state)
    print(action)

    # Execute the decision and retrieve the current information
    observation, terminal, reward = GazeboMaze.execute(action)
    observation = observation / 255.0  # normalize
    # print(reward)

    # Pass feedback about performance (and termination) to the agent
    agent.observe(terminal=terminal, reward=reward)
    timestep += 1
    episode_reward += reward
    if terminal or timestep == max_timesteps:
        success = GazeboMaze.success
        break

episode += 1
total_timestep += timestep
# avg_reward = float(episode_reward)/timestep
successes.append(success)
episode_rewards.append([episode_reward, timestep, success])
# if total_timestep > 100000:
#     print('{}th episode reward: {}'.format(episode, episode_reward))
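# A hedged sketch of the multi-component states_spec that such a dict-valued
# state implies; every shape below is an assumption, as the excerpt does not
# show the agent's configuration:
states_spec = dict(
    image=dict(type='float', shape=(64, 64, 3)),   # normalized camera image
    previous_act=dict(type='float', shape=(2,)),   # previous velocity command
    relative_pos=dict(type='float', shape=(2,))    # position relative to the goal
)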
import asyncio
import os
import time

from tensorforce.agents import DQNAgent
# (project-specific modules such as ClientInterface, Heuristic, State, Stats,
# flatten and NOMBRE_DE_PIECES are assumed to be imported from the surrounding project)


class Reinforcement(ClientInterface.ClientInterface):

    def __init__(self, name, load_file=None, is_stats=False, file_stats=None,
                 train_adversary_level=2, nb_batches=5000, nb_games_per_batch=2,
                 layer_size=15, nb_layers=3):
        """
        :param name: name of the AI.
        :param load_file: path and name of the model to load (without any extension).
        :param is_stats: boolean which tells whether the statistics are enabled.
        :param file_stats: name of the file where the statistics are written.
        :param train_adversary_level: integer indicating the AI to train against (corresponds to a level in AICreator).
        :param nb_batches: number of batches. A batch is a group of successive games on which the ratio
            (nb_won_games / nb_games_per_batch) is computed and saved in scores.txt.
        :param nb_games_per_batch: number of games per batch.
        :param layer_size: size of a neural network layer.
        :param nb_layers: number of layers in the neural network.
        """
        super().__init__(name, load_file)
        self.current_game_is_finish = None
        self.first_game = True

        # scores
        self.score_self_old, self.score_self_new = 0, 0
        self.score_other_old, self.score_other_new = 0, 0
        self.file_scores = open('scores.txt', 'w')

        # AI parameters
        self.heuristics = [
            Heuristic.line_transition,
            Heuristic.column_transition,
            Heuristic.hidden_empty_cells,
            Heuristic.wells,
            Heuristic.holes,
            Heuristic.highest_column,
            Heuristic.columns_heights
        ]
        state = State.State()
        heuristics_sizes = [heuristic(state, state, None) for heuristic in self.heuristics]
        self.nb_heuristics = len(flatten(heuristics_sizes))
        print('self.nb_heuristics:', self.nb_heuristics)
        self.train_adversary_level = train_adversary_level

        # iteration
        self.nb_batches = nb_batches
        self.nb_games_per_batch = nb_games_per_batch
        self.iteration = 0

        # neural network
        self.layer_size = layer_size
        self.nb_layers = nb_layers
        network_spec = [dict(type='dense', size=self.layer_size, activation='relu')] * self.nb_layers
        self.agent = DQNAgent(
            states_spec={
                'shape': (self.nb_heuristics + NOMBRE_DE_PIECES,),
                'type': 'float'
            },
            actions_spec={
                'hor_move': {'type': 'int', 'num_actions': 11},
                'rotate': {'type': 'int', 'num_actions': 4},
                'choose': {'type': 'int', 'num_actions': 3}
            },
            network_spec=network_spec
        )

        # loading of a saved model
        if load_file is not None:
            self.load(load_file)

        # stats
        self.is_stats = is_stats
        self.my_stats = None
        self.file_stats = file_stats
        self.pid_stats = None

    async def play(self, state):
        """
        Associates an action to a state. Called by the server.
        :param state: dictionary containing information about the game, sent by the server.
        :return: action to apply.
        """
        # update all the scores (self.score_self_new, self.score_self_old,
        # self.score_other_new, self.score_other_old)
        self.update_scores(state)

        # format the state to make it compatible with tensorforce
        state_formatted = self.format_state(state)

        if self.first_game:
            # at the first game and first call to play, no action has been performed yet ->
            # nothing to observe
            self.first_game = False
            self.agent.reset()
        else:
            # pass the observation to the agent
            terminal = False
            reward = (self.score_self_new - self.score_self_old) - (self.score_other_new - self.score_other_old)
            self.agent.observe(terminal, reward)

        # select the action (exploitation or exploration)
        action = self.agent.act(state_formatted)

        # format the action to make it exploitable by the Tetris game
        action_to_apply = self.format_action(action, state)

        return action_to_apply
        # return {"hor_move": -2, "rotate": 1, "choose": state["pieces"][0]}

    def on_init_game(self, data):
        """
        Called at the beginning of a game.
        :param data: dictionary containing information about the game, sent by the server.
        """
        print()
        print(self.iteration)
        self.my_id_in_game = data["ids_in_game"][0]

    def on_finished_game(self, data):
        """
        Called at the end of a game.
        :param data: dictionary containing information about the game, sent by the server.
        """
        self.iteration += 1
        self.current_game_is_finish = True

        # update all the scores
        self.update_scores(data)

        # pass the final observation to the agent
        terminal = True
        reward = (self.score_self_new - self.score_self_old) - (self.score_other_new - self.score_other_old)
        self.agent.observe(terminal, reward)

    def update_scores(self, state):
        """
        Updates the scores of the agent and of the other player.
        :param state: dictionary containing information about the game.
        """
        # update the old scores
        self.score_self_old, self.score_other_old = self.score_self_new, self.score_other_new
        # get the new scores
        self.score_self_new, self.score_other_new = self.format_score(state)

    @staticmethod
    def format_action(action, state):
        """
        Formats the action returned by tensorforce so that it can be used in the play function.
        :param action: action returned by tensorforce (function act).
        :param state: dictionary containing information about the game, sent by the server.
        :return: dictionary containing the action.
        """
        # convert int32 (which is not serializable) to standard int
        action_to_apply = {key: int(value) for key, value in action.items()}
        action_to_apply['hor_move'] -= 5  # [0, 10] -> [-5, 5]
        action_to_apply['choose'] = state['pieces'][action_to_apply['choose']]  # index to letter
        return action_to_apply

    def evaluate_heuristics(self, heuristics, g_prec, g_next, action):
        """
        Computes the current values of the heuristics.
        :param heuristics: list containing the heuristic functions.
        :param g_prec: previous state.
        :param g_next: current state.
        :param action: action which leads from g_prec to g_next.
        :return: flat list containing the heuristics values (flattening is necessary because some
            heuristics are lists).
        """
        return flatten([heuristic(g_prec, g_next, action) for heuristic in heuristics])

    def format_state(self, state):
        """
        Formats the state so that it can be used by tensorforce.
        :param state: dictionary containing information about the game, sent by the server.
        :return: list containing the heuristics values. Represents the state.
        """
        state_bis = State.State(state['grid'])
        heuristics_values = self.evaluate_heuristics(self.heuristics, None, state_bis, None)

        # selectable pieces as a one-hot vector
        pieces_one_hot = self.format_pieces(state['pieces'])

        # state used by tensorforce
        state_formatted = heuristics_values + pieces_one_hot
        print('{}, {}'.format(heuristics_values, pieces_one_hot))

        return state_formatted

    def format_pieces(self, pieces):
        """
        Formats the available pieces so that they can be used by tensorforce.
        :param pieces: 3-element list containing letters representing pieces (no repetition).
        :return: 7-element one-hot list containing 1s and 0s.
        """
        pieces_formatted = [0] * NOMBRE_DE_PIECES
        for piece in pieces:
            pieces_formatted[self.char_to_int(piece)] = 1
        return pieces_formatted

    def format_score(self, state):
        """
        Extracts the score of the AI and of the other player.
        :param state: dictionary containing information about the game, sent by the server.
        :return: score_self, score_other.
        """
        id_self = self.my_id_in_game
        id_other = (id_self + 1) % 2
        score_self = state['score'][id_self]
        score_other = state['score'][id_other]
        return score_self, score_other

    @staticmethod
    def char_to_int(char):
        """
        Converts a letter whose shape looks like a tetromino to a corresponding integer.
        :param char: 'O', 'I', 'L', 'T', 'S', 'Z' or 'J'.
        :return: integer from 0 to 6.
        """
        lu_table = {'O': 0, 'I': 1, 'L': 2, 'T': 3, 'S': 4, 'Z': 5, 'J': 6}
        return lu_table[char]

    async def train(self):
        """
        Triggers the training.
        """
        await super().init_train()

        if self.is_stats:
            self.my_stats = Stats.Stats()
            self.pid_stats = await self.my_stats.observe()

        for _ in range(self.nb_batches):
            wins = 0
            for _ in range(self.nb_games_per_batch):
                if self.is_stats:
                    await super().new_game(players=[[self.my_client.pid, 1]],
                                           ias=[[self.train_adversary_level, 1]],
                                           viewers=[0, self.pid_stats])
                else:
                    await super().new_game(players=[[self.my_client.pid, 1]],
                                           ias=[[self.train_adversary_level, 1]],
                                           viewers=[0])
                self.current_game_is_finish = False
                while not self.current_game_is_finish:
                    await asyncio.sleep(0)
                self.current_game_is_finish = False

                # increment wins when a game is won
                wins += 1 if self.score_self_new > self.score_other_new else 0

            # save the scores in a file
            self.file_scores.write('{}\n'.format(wins / self.nb_games_per_batch))
            self.file_scores.flush()

            self.save()

    def save(self):
        """
        Saves the current model in directory rein_learn_models as 3 files.
        """
        # TODO: report whether the model was loaded correctly
        # directory = os.path.join(os.getcwd(), 'rein_learn_models')
        time_str = time.strftime('%Y%m%d_%H%M%S')
        directory = os.path.join(os.getcwd(), 'rein_learn_models', 'agent_' + time_str)
        checkpoint = self.agent.save_model(directory=directory, append_timestep=True)
        print('directory: {}'.format(directory))
        print('checkpoint: {}'.format(checkpoint))

    def load(self, load_file):
        """
        Loads a saved model.
        :param load_file: path and name of the model to load (without any extension).
        """
        # load_file represents the file path (without any extension)
        directory = os.path.dirname(load_file)
        file = os.path.basename(load_file)
        self.agent.restore_model(directory=directory, file=file)
def test_blogpost_introduction_runner(self):
    from tensorforce.config import Configuration
    from tensorforce.core.networks import layered_network_builder
    from tensorforce.environments.minimal_test import MinimalTest
    from tensorforce.agents import DQNAgent
    from tensorforce.execution import Runner

    environment = MinimalTest(definition=False)

    network_config = [
        dict(type='dense', size=32)
    ]
    agent_config = Configuration(
        batch_size=8,
        learning_rate=0.001,
        memory_capacity=800,
        first_update=80,
        repeat_update=4,
        target_update_frequency=20,
        states=environment.states,
        actions=environment.actions,
        network=layered_network_builder(network_config)
    )
    agent = DQNAgent(config=agent_config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(runner):
        if runner.episode % 100 == 0:
            print(sum(runner.episode_rewards[-100:]) / 100)
        return runner.episode < 100 \
            or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

    # runner.run(episodes=1000, episode_finished=episode_finished)
    runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test

    ### Code block: next

    # max_episodes = 1000
    max_episodes = 10  # Only 10 episodes for this test
    max_timesteps = 2000

    episode = 0
    episode_rewards = list()

    while True:
        state = environment.reset()
        agent.reset()
        timestep = 0
        episode_reward = 0

        while True:
            action = agent.act(state=state)
            state, reward, terminal = environment.execute(action=action)
            agent.observe(reward=reward, terminal=terminal)

            timestep += 1
            episode_reward += reward

            if terminal or timestep == max_timesteps:
                break

        episode += 1
        episode_rewards.append(episode_reward)

        if all(reward >= 1.0 for reward in episode_rewards[-100:]) \
                or episode == max_episodes:
            break