Example #1
def main():
    """
    This function will be called for the evaluation (test) phase.
    """
    # Sample code for illustration, add your code below to run in test phase.
    # Load trained model from train/ directory
    env = gym.make(MINERL_GYM_ENV)
    if FRAME_SKIP > 0:
        env = FrameSkip(env, enable_rendering=True)
    env = ObsWrapper(env)
    env = MoveAxisWrapper(env, -1, 0)
    env = CombineActionWrapper(env)

    agent = Agent(env.observation_space, env.action_space)
    agent.load_model()

    for _ in range(MINERL_MAX_EVALUATION_EPISODES):
        obs = env.reset()
        done = False
        netr = 0
        while not done:
            action = agent.act(obs)
            obs, reward, done, info = env.step(action)
            netr += reward
            env.render()

    env.close()
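
The FrameSkip, ObsWrapper, MoveAxisWrapper and CombineActionWrapper classes belong to the submission and are not shown here. As a rough idea only, a minimal frame-skip wrapper in the classic gym API could look like the sketch below; the class name, constructor signature, and skip count are assumptions (the example above also passes an enable_rendering flag).

import gym

class FrameSkipSketch(gym.Wrapper):
    """Illustrative frame-skip wrapper: repeat each action `skip` times and sum the rewards."""

    def __init__(self, env, skip=4):
        super().__init__(env)
        self._skip = skip  # assumption: fixed skip count

    def step(self, action):
        total_reward, obs, done, info = 0.0, None, False, {}
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info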
Example #2
def dqn(
    agent: Agent,
    env,
    brain_name,
    n_episodes: int = 10,
    eps_start: float = 1.0,
    eps_end: float = 0.01,
    eps_decay: float = 0.995,
):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print(
            f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}",
            end="",
        )
        if i_episode % 100 == 0:
            print(
                f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}"
            )
        if np.mean(scores_window) >= 13.0:
            print(
                f"\nEnvironment solved in {i_episode-100:d} episodes!\tAverage Score:"
                f" {np.mean(scores_window):.2f}")
            torch.save(agent.qnetwork_local.state_dict(), "checkpoint.pth")
            break
    return scores
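
The env.reset(train_mode=True)[brain_name] / vector_observations access pattern above follows the unityagents API. A hedged driver for dqn(), assuming a Banana-style Unity environment with 37-dimensional observations and 4 discrete actions (the executable path and the Agent constructor arguments are assumptions, not part of this example):

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Banana.x86_64")  # placeholder path
brain_name = env.brain_names[0]

# Constructor arguments are assumptions; adapt them to the actual Agent class.
agent = Agent(state_size=37, action_size=4, seed=0)

scores = dqn(agent, env, brain_name, n_episodes=2000)
env.close()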
Example #3
def ddpg(agent: Agent, env, brain_name, n_agents, n_episodes: int = 10):
    scores_window = deque(maxlen=100)
    scores_mean_agent = []
    scores_mean = []

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset()[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(n_agents)
        while True:

            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        score = np.mean(scores)
        scores_window.append(score)
        scores_mean_agent.append(score)
        scores_mean.append(np.mean(scores_window))

        print(
            f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}",
            end="",
        )
        if i_episode % 100 == 0:
            print(
                f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}"
            )
        if np.mean(scores_window) >= 30.0:
            print(
                f"\nEnvironment solved in {i_episode-100:d} episodes!\tAverage Score:"
                f" {np.mean(scores_window):.2f}")
            torch.save(agent.policy_network_local.state_dict(),
                       "checkpoint_policy.pth")
            torch.save(agent.qnetwork_local.state_dict(),
                       "checkpoint_qnetwork.pth")
            print("saved networks")
            break

    return scores_mean_agent, scores_mean
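
The two lists returned by ddpg() lend themselves to a quick training-progress plot. A sketch, assuming matplotlib and an agent/env/brain_name prepared as in the driver sketch after Example #2 (the 20-agent count and episode budget are assumptions):

import matplotlib.pyplot as plt

scores_mean_agent, scores_mean = ddpg(agent, env, brain_name, n_agents=20, n_episodes=200)

plt.plot(scores_mean_agent, label="episode score (mean over agents)")
plt.plot(scores_mean, label="100-episode moving average")
plt.xlabel("episode")
plt.ylabel("score")
plt.legend()
plt.show()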
Example #4
def main():

    window_size = 5
    episode_count = 10
    stock_name = "^GSPC_2011"

    agent = Agent(window_size)
    market = Market(window_size=window_size, stock_name=stock_name)

    batch_size = 32

    start_time = time.time()
    for e in range(episode_count + 1):
        print("Episodio" + str(e) + "/" + str(episode_count))
        agent.reset()
        state, price_data = market.reset()  # ToDo: get the initial state

        for t in range(market.last_data_index):
            # get the agent's current action by
            # calling the agent's act() method with the current state
            action, bought_price = agent.act(state, price_data)

            # get the agent's next state according to the market
            next_state, next_price_data, reward, done =\
                market.get_next_state_reward(action, bought_price)

            # add the transaction to memory
            agent.memory.append((state, action, reward, next_state, done))
            # learn from history only once there is enough memory
            if len(agent.memory) > batch_size:
                agent.experience_replay(batch_size)

            state = next_state
            price_data = next_price_data

            if done:
                print("--------------------------------")
                print("Ganancias totales: {0}".format(
                    agent.get_total_profit()))
                print("--------------------------------")

        if e % 10 == 0:
            if not os.path.exists("models"):
                os.mkdir("models")
            agent.model.save("models/model_rl" + str(e))

    end_time = time.time()
    training_time = round(end_time - start_time)
    print("Entrenamiento tomó {0} segundos.".format(training_time))
Example #5
	def test_act_tau_0(self):
		config = {
			'ALPHA': 0.8,
			'CPUCT': 1,
			'EPSILON': 0.2,
			'ACTION_SIZE': 32 * 4 * 7,
			'MCTS_SIMULATIONS': 3
		}
		action_encoder = ActionEncoder(DirectionResolver())
		agent = Agent(model=None, action_encoder=action_encoder, state_encoder=StateEncoder(), name='player1', config=config)
		game_root = Game()
		root_node = Node(game_root)

		child1 = Node(game_root.move(game_root.get_possible_moves()[0]))
		edge1 = Edge(root_node, child1, 0.33, 8)
		edge1.stats['N'] = 10
		edge1.stats['Q'] = 0.2

		root_node.edges.append(edge1)

		child2 = Node(game_root.move(game_root.get_possible_moves()[1]))
		edge2 = Edge(root_node, child2, 0.5, 104)
		edge2.stats['N'] = 20
		edge2.stats['Q'] = 0.5
		root_node.edges.append(edge2)

		child3 = Node(game_root.move(game_root.get_possible_moves()[2]))
		edge3 = Edge(root_node, child3, 0.17, 9)
		edge3.stats['N'] = 15
		edge3.stats['Q'] = 0.3
		root_node.edges.append(edge3)

		agent.prepare_mcts_for_next_action = MagicMock()
		mcts = MagicMock()
		mcts.root = root_node
		mcts.evaluate_leaf.return_value = 0.7
		agent.mcts = mcts
		mcts.move_to_leaf.return_value = (root_node, 0.5, False, [])

		action, pi, value = agent.act(game_root, tau=0)

		self.assertEqual(action, [9, 14])
		self.assertEqual(value, 0.5)
		self.assertEqual(pi[8], 10/(10 + 20 + 15))
		self.assertEqual(pi[9], 15/(10 + 20 + 15))
		self.assertEqual(pi[8 + 3*32], 20/(10 + 20 + 15))
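
The expected pi values in the assertions are just the root edges' visit counts normalised by their sum, and with tau=0 the agent picks the most visited edge deterministically. A standalone check of that arithmetic, independent of the Agent/MCTS classes:

import numpy as np

# visit counts of the three root edges above, keyed by action index
visit_counts = {8: 10, 8 + 3 * 32: 20, 9: 15}
total_visits = sum(visit_counts.values())  # 45

pi = np.zeros(32 * 4 * 7)
for action_index, n in visit_counts.items():
    pi[action_index] = n / total_visits

print(pi[8], pi[9], pi[8 + 3 * 32])             # 0.222..., 0.333..., 0.444...
print(max(visit_counts, key=visit_counts.get))  # 104, the most visited edge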
Example #6
def main():
    """
    Evaluar el agente entrenado en un dataset de acciones en otro
    completamente diferente
    """
    stock_name = "GSPC_2011-03"
    model_name = "model_rl"
    # load the trained weights
    model = load_model("models/" + model_name)
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    # start from an initial state
    state, price_data = market.reset()

    for t in range(market.last_data_index):

        # action for the current state
        action, bought_price = agent.act(state, price_data)
        # apply the action to obtain the reward and observe
        # the next state
        next_state, next_price_data, reward, done =\
            market.get_next_state_reward(action, bought_price)
        # next state and total profit
        state = next_state
        price_data = next_price_data
        if done:
            print("--------------------------------")
            print("{0} Ganancias totales: {1}".format(
                stock_name, agent.get_total_profit()))
            print("--------------------------------")
    plot_action_profit(market.data, agent.action_history,
                       agent.get_total_profit())
Example #7
agent_ac = Agent(params, painter)

writer = SummaryWriter(log_dir=params['log_folder'])

all_mean_rewards = []
all_mean_actor_loss = []
all_mean_critic_loss = []
time_step = 0
for i_episode in range(params['num_episodes']):
    observed_map, robot_pose = grid_env.reset()
    done = False
    rewards = []
    actor_losses = []
    critic_losses = []
    while not done:
        action, action_log_prob = agent_ac.act(observed_map, robot_pose)
        observed_map_next, robot_pose_next, reward, done = grid_env.step(action)
        actor_loss, critic_loss = agent_ac.step(state=[observed_map, robot_pose], log_prob=action_log_prob,
                                                action=action, reward=reward,
                                                next_state=[observed_map_next, robot_pose_next], done=done)
        actor_losses.append(actor_loss)
        critic_losses.append(critic_loss)
        # move to the next state
        observed_map = observed_map_next.copy()
        robot_pose = robot_pose_next.copy()

        time_step += 1

        if params['visualise']:
            painter.update()
            for event in pygame.event.get():
Example #8
writer = SummaryWriter(log_dir=os.path.join("log"))

output_model_dir = "model"
if not os.path.exists(output_model_dir):
    os.makedirs(output_model_dir)

all_mean_rewards = []

for i_episode in range(num_episodes):
    observed_map, robot_pose = grid_env.reset()
    done = False
    rewards = []

    while not done:
        action = dqn_agent.act(observed_map, robot_pose)
        observed_map_next, robot_pose_next, reward, done = grid_env.step(
            action)
        dqn_agent.step(state=[observed_map, robot_pose],
                       action=action,
                       reward=reward,
                       next_state=[observed_map_next, robot_pose_next],
                       done=done)
        # print("action=", action, ";reward:", reward, ";done:", done)
        if visualise:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
        rewards.append(reward)
        if done:
            if (i_episode + 1) % 10000 == 0:
Example #9
set_seed(1)

if __name__ == "__main__":
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)
    env = gym.make('CartPole-v1')
    cfg.state_space_dim = env.observation_space.shape[0]
    cfg.action_space_dim = env.action_space.n
    agent = Agent(**cfg)
    agent.load_dict()

    for episode in range(300):
        s0 = env.reset()
        total_reward = 1
        while True:
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)

            if done:
                r1 = -1

            agent.put(s0, a0, r1, s1)

            if done:
                break

            total_reward += r1
            s0 = s1
            agent.learn()
        agent.load_dict()
        print("episode", episode, "total_reward", total_reward)
Example #10
import numpy as np
import matplotlib.pyplot as plt

from src.environment import Environment
from src.agent import Agent

if __name__ == "__main__":
    environment = Environment()
    agent = Agent(3, 5)
    loss = []
    episode = 1000
    for e in range(episode):
        state = environment.reset()
        state = np.reshape(state, (1, 5))
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            reward, next_state, done = environment.step(action)
            score += reward
            next_state = np.reshape(next_state, (1, 5))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            #agent.replay()
            if done:
                print(f"Episode {e}/{episode}, score: {score}")
                break
        agent.replay()
        loss.append(score)
    plt.plot([i for i in range(episode)], loss)
    plt.xlabel("episodes")
    plt.ylabel("rewards")
    plt.show()
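
The Agent(3, 5) internals are not shown in this example. A minimal epsilon-greedy act() consistent with the loop above, written as a free function over a hypothetical agent with a Keras-style .model (the epsilon value and the 3-action count are assumptions):

import numpy as np

def act_sketch(agent, state, epsilon=0.1, action_size=3):
    """Epsilon-greedy action selection (sketch only)."""
    if np.random.rand() < epsilon:
        return np.random.randint(action_size)   # explore
    q_values = agent.model.predict(state, verbose=0)
    return int(np.argmax(q_values[0]))          # exploit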
Example #11
def main():
    writer = SummaryWriter()

    env = gym.make('MineRLObtainDiamondDense-v0')
    if FRAME_SKIP > 0:
        env = FrameSkip(env, FRAME_SKIP)
    env = ObsWrapper(env)
    env = MoveAxisWrapper(env, -1, 0)
    env = CombineActionWrapper(env)

    agent = Agent(env.observation_space, env.action_space)
    data = minerl.data.make('MineRLTreechop-v0', data_dir=MINERL_DATA_ROOT)
    data_source = data.sarsd_iter(num_epochs=-1, max_sequence_len=DATA_BATCH_SIZE)

    # data_2 = minerl.data.make('MineRLObtainDiamond-v0', data_dir=MINERL_DATA_ROOT)
    # data_2_source = data.sarsd_iter(num_epochs=-1, max_sequence_len=128)

    # behavioral cloning
    train_from_expert(agent, data_source)

    net_steps = 0
    n_episode = 0
    while True:
        obs = env.reset()
        done = False
        netr = 0
        net_bonus_r = 0
        nobs = None
        step = 0
        while not done:
            action = agent.act(obs)
            nobs, reward, done, info = env.step(action)
            netr += reward
            reward += agent.bonus_reward(obs, action, nobs)
            net_bonus_r += reward
            agent.add_data(obs, action, reward, nobs, done)
            obs = nobs

            # To get a better view of your training phase, it is suggested
            # to register progress continuously, e.g. when 54% completed:
            # aicrowd_helper.register_progress(0.54)

            # To fetch the latest information from the instance manager, you can run the lines below whenever you want to know the state
            #>> parser.update_information()
            #>> print(parser.payload)
            # .payload: provides the AIcrowd-generated json
            # Example: {'state': 'RUNNING', 'score': {'score': 0.0, 'score_secondary': 0.0}, 'instances': {'1': {'totalNumberSteps': 2001, 'totalNumberEpisodes': 0, 'currentEnvironment': 'MineRLObtainDiamond-v0', 'state': 'IN_PROGRESS', 'episodes': [{'numTicks': 2001, 'environment': 'MineRLObtainDiamond-v0', 'rewards': 0.0, 'state': 'IN_PROGRESS'}], 'score': {'score': 0.0, 'score_secondary': 0.0}}}}
            # .current_state: provides in-depth state information available as a dictionary (key: instance id)

            step += 1
            net_steps += 1

            if (TRAIN_INTERVAL != 0 and step % TRAIN_INTERVAL == 0) or done:
                total_discrim_loss = 0.0
                total_value = total_ppo_loss = total_value_loss = total_entropy = 0
                n_epoch = 0
                while not agent.is_memory_empty():
                    s, a, _, _, _ = data_source.__next__()
                    s = data_state_wrapper(s)
                    a = data_action_wrapper(a)
                    total_discrim_loss += agent.train_discriminator(s, a)
                    value, ppo_loss, value_loss, entropy = agent.train_policy()

                    total_value += value
                    total_ppo_loss += ppo_loss
                    total_value_loss += value_loss
                    total_entropy += entropy
                    n_epoch += 1

                writer.add_scalar('Train/Value', total_value / n_epoch, net_steps)
                writer.add_scalar('Train/PolicyLoss', total_ppo_loss / n_epoch, net_steps)
                writer.add_scalar('Train/ValueLoss', total_value_loss / n_epoch, net_steps)
                writer.add_scalar('Train/Entropy', total_entropy / n_epoch, net_steps)
                writer.add_scalar('Train/DiscriminatorLoss', total_discrim_loss / n_epoch, net_steps)
                agent.save_model()

        writer.add_scalar('Reward/ExternalReward', netr, n_episode)
        writer.add_scalar('Reward/TotalReward', net_bonus_r, n_episode)
        n_episode += 1

        agent.save_model()

    agent.save_model()

    aicrowd_helper.register_progress(1)
    env.close()
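
train_from_expert() is referenced above but not part of the snippet. A hedged sketch of one behavioural-cloning pass over the MineRL sarsd iterator, reusing the data_state_wrapper/data_action_wrapper helpers from the training loop; agent.train_bc is an assumed supervised-update method, not something the example defines:

def train_from_expert_sketch(agent, data_source, n_batches=1000):
    """Behavioural cloning sketch: fit the policy to expert state-action pairs."""
    for _ in range(n_batches):
        s, a, _, _, _ = next(data_source)   # expert batch from sarsd_iter
        states = data_state_wrapper(s)      # map to the wrapped observation space
        actions = data_action_wrapper(a)    # map to the combined action space
        agent.train_bc(states, actions)     # assumed supervised update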