def main(): """ This function will be called for training phase. """ # Sample code for illustration, add your code below to run in test phase. # Load trained model from train/ directory env = gym.make(MINERL_GYM_ENV) if FRAME_SKIP > 0: env = FrameSkip(env, enable_rendering=True) env = ObsWrapper(env) env = MoveAxisWrapper(env, -1, 0) env = CombineActionWrapper(env) agent = Agent(env.observation_space, env.action_space) agent.load_model() for _ in range(MINERL_MAX_EVALUATION_EPISODES): obs = env.reset() done = False netr = 0 while not done: action = agent.act(obs) obs, reward, done, info = env.step(action) netr += reward env.render() env.close()
def dqn(
    agent: Agent,
    env,
    brain_name,
    n_episodes: int = 10,
    eps_start: float = 1.0,
    eps_end: float = 0.01,
    eps_decay: float = 0.995,
):
    """Deep Q-Learning.

    Params
    ======
        agent (Agent): agent being trained
        env: environment the agent interacts with
        brain_name: key used to index the environment's brain info
        n_episodes (int): maximum number of training episodes
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save the most recent score
        scores.append(score)  # save the most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print(
            f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}",
            end="",
        )
        if i_episode % 100 == 0:
            print(
                f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}"
            )
        if np.mean(scores_window) >= 13.0:
            print(
                f"\nEnvironment solved in {i_episode-100:d} episodes!\tAverage Score:"
                f" {np.mean(scores_window):.2f}")
            torch.save(agent.qnetwork_local.state_dict(), "checkpoint.pth")
            break
    return scores
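# A hedged sketch of how dqn() above might be driven, assuming a Unity
# ML-Agents (unityagents, v0.4-style) environment such as the Banana
# collector. The executable path, state_size=37 and the
# Agent(state_size, action_size, seed) constructor are assumptions, not
# taken from the snippet itself.
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Banana.app")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
agent = Agent(state_size=37, action_size=brain.vector_action_space_size, seed=0)
scores = dqn(agent, env, brain_name, n_episodes=2000)
env.close()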
def ddpg(agent: Agent, env, brain_name, n_agents, n_episodes: int = 10):
    scores_window = deque(maxlen=100)
    scores_mean_agent = []
    scores_mean = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset()[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(n_agents)
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        score = np.mean(scores)
        scores_window.append(score)
        scores_mean_agent.append(score)
        scores_mean.append(np.mean(scores_window))
        print(
            f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}",
            end="",
        )
        if i_episode % 100 == 0:
            print(
                f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}"
            )
        if np.mean(scores_window) >= 30.0:
            print(
                f"\nEnvironment solved in {i_episode:d} episodes!\tAverage Score:"
                f" {np.mean(scores_window):.2f}")
            torch.save(agent.policy_network_local.state_dict(), "checkpoint_policy.pth")
            torch.save(agent.qnetwork_local.state_dict(), "checkpoint_qnetwork.pth")
            print("saved networks")
            break
    return scores_mean_agent, scores_mean
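# Illustrative follow-up, not part of the original snippet: plot the
# per-episode score returned by ddpg() against its 100-episode moving
# average. The agent/env/brain_name objects, n_agents=20 and n_episodes=200
# are assumed to be set up as in the surrounding code.
import matplotlib.pyplot as plt

scores_agent, scores_avg = ddpg(agent, env, brain_name, n_agents=20, n_episodes=200)
plt.plot(scores_agent, label="mean score over agents per episode")
plt.plot(scores_avg, label="100-episode moving average")
plt.xlabel("episode")
plt.ylabel("score")
plt.legend()
plt.show()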
def main():
    window_size = 5
    episode_count = 10
    stock_name = "^GSPC_2011"

    agent = Agent(window_size)
    market = Market(window_size=window_size, stock_name=stock_name)

    batch_size = 32
    start_time = time.time()
    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        agent.reset()
        state, price_data = market.reset()  # ToDo: get the initial state

        for t in range(market.last_data_index):
            # get the agent's current action:
            # call the agent's act() method with the current state
            action, bought_price = agent.act(state, price_data)

            # get the agent's next state according to the market
            next_state, next_price_data, reward, done = \
                market.get_next_state_reward(action, bought_price)

            # add the transaction to memory
            agent.memory.append((state, action, reward, next_state, done))

            # learn from past experience only once there is enough memory
            if len(agent.memory) > batch_size:
                agent.experience_replay(batch_size)

            state = next_state
            price_data = next_price_data

            if done:
                print("--------------------------------")
                print("Total profit: {0}".format(agent.get_total_profit()))
                print("--------------------------------")

        if e % 10 == 0:
            if not os.path.exists("models"):
                os.mkdir("models")
            agent.model.save("models/model_rl" + str(e))

    end_time = time.time()
    training_time = round(end_time - start_time)
    print("Training took {0} seconds.".format(training_time))
def test_act_tau_0(self):
    config = {
        'ALPHA': 0.8,
        'CPUCT': 1,
        'EPSILON': 0.2,
        'ACTION_SIZE': 32 * 4 * 7,
        'MCTS_SIMULATIONS': 3
    }
    action_encoder = ActionEncoder(DirectionResolver())
    agent = Agent(model=None, action_encoder=action_encoder,
                  state_encoder=StateEncoder(), name='player1', config=config)
    game_root = Game()
    root_node = Node(game_root)

    child1 = Node(game_root.move(game_root.get_possible_moves()[0]))
    edge1 = Edge(root_node, child1, 0.33, 8)
    edge1.stats['N'] = 10
    edge1.stats['Q'] = 0.2
    root_node.edges.append(edge1)

    child2 = Node(game_root.move(game_root.get_possible_moves()[1]))
    edge2 = Edge(root_node, child2, 0.5, 104)
    edge2.stats['N'] = 20
    edge2.stats['Q'] = 0.5
    root_node.edges.append(edge2)

    child3 = Node(game_root.move(game_root.get_possible_moves()[2]))
    edge3 = Edge(root_node, child3, 0.17, 9)
    edge3.stats['N'] = 15
    edge3.stats['Q'] = 0.3
    root_node.edges.append(edge3)

    agent.prepare_mcts_for_next_action = MagicMock()
    mcts = MagicMock()
    mcts.root = root_node
    mcts.evaluate_leaf.return_value = 0.7
    agent.mcts = mcts
    mcts.move_to_leaf.return_value = (root_node, 0.5, False, [])

    action, pi, value = agent.act(game_root, tau=0)

    self.assertEqual(action, [9, 14])
    self.assertEqual(value, 0.5)
    self.assertEqual(pi[8], 10 / (10 + 20 + 15))
    self.assertEqual(pi[9], 15 / (10 + 20 + 15))
    self.assertEqual(pi[8 + 3 * 32], 20 / (10 + 20 + 15))
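# A hedged sketch of the behaviour the assertions above imply, not the real
# Agent.act(): pi is the normalized MCTS visit-count distribution over the
# flat action space, and with tau == 0 the most-visited edge is chosen
# greedily. The edge.action_id attribute name is an assumption.
import numpy as np


def visit_count_policy(edges, action_size, tau=0):
    counts = np.zeros(action_size)
    for edge in edges:
        counts[edge.action_id] = edge.stats['N']
    pi = counts / counts.sum()
    if tau == 0:
        chosen = int(np.argmax(counts))  # deterministic: greedy in visit count
    else:
        weighted = counts ** (1.0 / tau)
        chosen = int(np.random.choice(action_size, p=weighted / weighted.sum()))
    return chosen, pi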
def main(): """ Evaluar el agente entrenado en un dataset de acciones en otro completamente diferente """ stock_name = "GSPC_2011-03" model_name = "model_rl" # cargar pesos model = load_model("models/" + model_name) window_size = model.layers[0].input.shape.as_list()[1] agent = Agent(window_size, True, model_name) market = Market(window_size, stock_name) # Empezar desde un estado inicial state, price_data = market.reset() for t in range(market.last_data_index): # accion para el estado actual action, bought_price = agent.act(state, price_data) # verificar la acción para obtener recompensa y observar # el siguiente estado # obtener el siguiente estado next_state, next_price_data, reward, done =\ market.get_next_state_reward(action, bought_price) # estado siguiente y ganancias totales state = next_state price_data = next_price_data if done: print("--------------------------------") print("{0} Ganancias totales: {1}".format( stock_name, agent.get_total_profit())) print("--------------------------------") plot_action_profit(market.data, agent.action_history, agent.get_total_profit())
agent_ac = Agent(params, painter)
writer = SummaryWriter(log_dir=params['log_folder'])

all_mean_rewards = []
all_mean_actor_loss = []
all_mean_critic_loss = []

time_step = 0
for i_episode in range(params['num_episodes']):
    observed_map, robot_pose = grid_env.reset()
    done = False
    rewards = []
    actor_losses = []
    critic_losses = []
    while not done:
        action, action_log_prob = agent_ac.act(observed_map, robot_pose)
        observed_map_next, robot_pose_next, reward, done = grid_env.step(action)
        actor_loss, critic_loss = agent_ac.step(
            state=[observed_map, robot_pose],
            log_prob=action_log_prob,
            action=action,
            reward=reward,
            next_state=[observed_map_next, robot_pose_next],
            done=done,
        )
        actor_losses.append(actor_loss)
        critic_losses.append(critic_loss)

        # move on to the next state
        observed_map = observed_map_next.copy()
        robot_pose = robot_pose_next.copy()

        time_step += 1
        if params['visualise']:
            painter.update()
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
writer = SummaryWriter(log_dir=os.path.join("log"))

output_model_dir = "model"
if not os.path.exists(output_model_dir):
    os.makedirs(output_model_dir)

all_mean_rewards = []
for i_episode in range(num_episodes):
    observed_map, robot_pose = grid_env.reset()
    done = False
    rewards = []
    while not done:
        action = dqn_agent.act(observed_map, robot_pose)
        observed_map_next, robot_pose_next, reward, done = grid_env.step(action)
        dqn_agent.step(state=[observed_map, robot_pose],
                       action=action,
                       reward=reward,
                       next_state=[observed_map_next, robot_pose_next],
                       done=done)
        # print("action=", action, ";reward:", reward, ";done:", done)
        if visualise:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()

        rewards.append(reward)

        if done:
            if (i_episode + 1) % 10000 == 0:
set_seed(1)

if __name__ == "__main__":
    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    env = gym.make('CartPole-v1')
    cfg.state_space_dim = env.observation_space.shape[0]
    cfg.action_space_dim = env.action_space.n
    agent = Agent(**cfg)
    agent.load_dict()

    for episode in range(300):
        s0 = env.reset()
        total_reward = 1
        while True:
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)

            if done:
                r1 = -1

            agent.put(s0, a0, r1, s1)

            if done:
                break

            total_reward += r1
            s0 = s1
            agent.learn()

        agent.load_dict()
        print("episode", episode, "total_reward", total_reward)
import numpy as np
import matplotlib.pyplot as plt

from src.environment import Environment
from src.agent import Agent

if __name__ == "__main__":
    environment = Environment()
    agent = Agent(3, 5)
    loss = []
    episode = 1000
    for e in range(episode):
        state = environment.reset()
        state = np.reshape(state, (1, 5))
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            reward, next_state, done = environment.step(action)
            score += reward
            next_state = np.reshape(next_state, (1, 5))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            # agent.replay()
            if done:
                print(f"Episode {e}/{episode}, score: {score}")
                break
        agent.replay()
        loss.append(score)

    plt.plot([i for i in range(episode)], loss)
    plt.xlabel("episodes")
    plt.ylabel("rewards")
    plt.show()
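# Hedged sketch of what an experience-replay update such as agent.replay()
# typically does for a Keras-style Q-network; the Agent internals are not
# shown above, so the model/memory/gamma names here are illustrative
# assumptions, not the project's actual implementation.
import random
import numpy as np


def replay_sketch(model, memory, batch_size=32, gamma=0.99):
    # Sample a minibatch of transitions and fit the network toward
    # r + gamma * max_a' Q(s', a') targets.
    batch = random.sample(list(memory), min(len(memory), batch_size))
    for state, action, reward, next_state, done in batch:
        target = reward
        if not done:
            target += gamma * np.max(model.predict(next_state, verbose=0)[0])
        q_values = model.predict(state, verbose=0)
        q_values[0][action] = target
        model.fit(state, q_values, epochs=1, verbose=0)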
def main():
    writer = SummaryWriter()
    env = gym.make('MineRLObtainDiamondDense-v0')
    if FRAME_SKIP > 0:
        env = FrameSkip(env, FRAME_SKIP)
    env = ObsWrapper(env)
    env = MoveAxisWrapper(env, -1, 0)
    env = CombineActionWrapper(env)

    agent = Agent(env.observation_space, env.action_space)

    data = minerl.data.make('MineRLTreechop-v0', data_dir=MINERL_DATA_ROOT)
    data_source = data.sarsd_iter(num_epochs=-1, max_sequence_len=DATA_BATCH_SIZE)
    # data_2 = minerl.data.make('MineRLObtainDiamond-v0', data_dir=MINERL_DATA_ROOT)
    # data_2_source = data.sarsd_iter(num_epochs=-1, max_sequence_len=128)

    # behavioral cloning
    train_from_expert(agent, data_source)

    net_steps = 0
    n_episode = 0
    while True:
        obs = env.reset()
        done = False
        netr = 0
        net_bonus_r = 0
        nobs = None
        step = 0
        while not done:
            action = agent.act(obs)
            nobs, reward, done, info = env.step(action)
            netr += reward
            reward += agent.bonus_reward(obs, action, nobs)
            net_bonus_r += reward
            agent.add_data(obs, action, reward, nobs, done)
            obs = nobs

            # To get a better view of your training phase, it is suggested
            # to register progress continuously, e.g. when 54% completed:
            # aicrowd_helper.register_progress(0.54)

            # To fetch the latest information from the instance manager, you can run the below
            # when you want to know the state:
            # >> parser.update_information()
            # >> print(parser.payload)
            # .payload: provides the AIcrowd-generated json
            # Example: {'state': 'RUNNING', 'score': {'score': 0.0, 'score_secondary': 0.0}, 'instances': {'1': {'totalNumberSteps': 2001, 'totalNumberEpisodes': 0, 'currentEnvironment': 'MineRLObtainDiamond-v0', 'state': 'IN_PROGRESS', 'episodes': [{'numTicks': 2001, 'environment': 'MineRLObtainDiamond-v0', 'rewards': 0.0, 'state': 'IN_PROGRESS'}], 'score': {'score': 0.0, 'score_secondary': 0.0}}}}
            # .current_state: provides in-depth state information, available as a dictionary (key: instance id)

            step += 1
            net_steps += 1
            if (TRAIN_INTERVAL != 0 and step % TRAIN_INTERVAL == 0) or done:
                total_discrim_loss = 0.0
                total_value = total_ppo_loss = total_value_loss = total_entropy = 0
                n_epoch = 0
                while not agent.is_memory_empty():
                    s, a, _, _, _ = next(data_source)
                    s = data_state_wrapper(s)
                    a = data_action_wrapper(a)
                    total_discrim_loss += agent.train_discriminator(s, a)
                    value, ppo_loss, value_loss, entropy = agent.train_policy()
                    total_value += value
                    total_ppo_loss += ppo_loss
                    total_value_loss += value_loss
                    total_entropy += entropy
                    n_epoch += 1
                writer.add_scalar('Train/Value', total_value / n_epoch, net_steps)
                writer.add_scalar('Train/PolicyLoss', total_ppo_loss / n_epoch, net_steps)
                writer.add_scalar('Train/ValueLoss', total_value_loss / n_epoch, net_steps)
                writer.add_scalar('Train/Entropy', total_entropy / n_epoch, net_steps)
                writer.add_scalar('Train/DiscriminatorLoss', total_discrim_loss / n_epoch, net_steps)
                agent.save_model()

        writer.add_scalar('Reward/ExternalReward', netr, n_episode)
        writer.add_scalar('Reward/TotalReward', net_bonus_r, n_episode)
        n_episode += 1
        agent.save_model()

    agent.save_model()
    aicrowd_helper.register_progress(1)
    env.close()