Example 1
def train(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={
        'seed': args.seed,
    })

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    else:
        raise ValueError(f'Unsupported algorithm: {args.algorithm}')
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that the trained agent always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
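
For reference, here is a minimal sketch of how train() could be invoked. The argument names mirror exactly the attributes the function reads from args; the default values are illustrative assumptions, not values taken from the original script.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser('Run an RL agent in RLCard')
    parser.add_argument('--env', type=str, default='leduc-holdem')
    parser.add_argument('--algorithm', type=str, default='dqn', choices=['dqn', 'nfsp'])
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--log_dir', type=str, default='experiments/dqn_result/')

    args = parser.parse_args()
    train(args)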
Example 2
    def run(self, is_training=False):
        ''' Run X complete games, where X is the number of environments.
            The input/output are similar to Env. The difference is that
            the transitions for each player are stacked over the environments.
        '''
        trajectories = [[[] for _ in range(self.player_num)] for _ in range(self.num)]
        ready_trajectories = [None for _ in range(self.num)]
        active_remotes = list(self.remotes)
        mapping = list(range(self.num))
        active_num = self.num

        # Reset
        states = []
        player_ids = []
        for state, player_id in send_command_to_all(active_remotes, ('reset', None)):
            states.append(state)
            player_ids.append(player_id)
        for i in range(active_num):
            trajectories[i][player_ids[i]].append(states[i])

        # Loop until all the environments are over
        while active_num > 0:
            # Agents play
            # TODO: Currently we naively feed one observation to the agent at a time. This could be batched.
            commands = []
            actions = []
            for i in range(active_num):
                opt = 'raw_step' if self.agents[player_ids[i]].use_raw else 'step'
                if not is_training:
                    action, _ = self.agents[player_ids[i]].eval_step(states[i])
                else:
                    action = self.agents[player_ids[i]].step(states[i])
                commands.append((opt, action))
                actions.append(action)

            # Environment steps
            next_states, next_player_ids, dones = [], [], []
            for next_state, next_player_id, done in send_commands_to_all(active_remotes, commands):
                next_states.append(next_state)
                next_player_ids.append(next_player_id)
                dones.append(done)

            # Save action
            for i in range(active_num):
                trajectories[i][player_ids[i]].append(actions[i])

            # Set the state and player
            states = next_states
            player_ids = next_player_ids

            # Save the next state, or finalize environments that are done
            finished = []
            for i in range(active_num):
                if dones[i]:
                    # Add a final state to all the players
                    for j in range(self.player_num):
                        active_remotes[i].send(('get_state', j))
                        trajectories[i][j].append(active_remotes[i].recv())

                    # Save the ready trajectories and mark them as finished
                    ready_trajectories[mapping[i]] = trajectories[i]
                    finished.append(i)
                else:
                    trajectories[i][player_ids[i]].append(states[i])

            # Pop out the finished ones
            trajectories = [trajectories[i] for i in range(active_num) if i not in finished]
            mapping = [mapping[i] for i in range(active_num) if i not in finished]
            active_remotes = [active_remotes[i] for i in range(active_num) if i not in finished]
            states = [states[i] for i in range(active_num) if i not in finished]
            player_ids = [player_ids[i] for i in range(active_num) if i not in finished]

            self.timestep += active_num
            active_num -= len(finished)

        # Payoffs
        payoffs = send_command_to_all(self.remotes, ('get_payoffs', None))

        for i in range(self.num):
            ready_trajectories[i] = reorganize(ready_trajectories[i], payoffs[i])

        trajectories = [[] for _ in range(self.player_num)]
        for trs in ready_trajectories:
            for i in range(self.player_num):
                trajectories[i].extend(trs[i])
        return trajectories, payoffs
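
The run() method above relies on two helpers that are not shown in this snippet, send_command_to_all and send_commands_to_all. A minimal sketch of what they could look like follows, under the assumption that each remote is the parent end of a multiprocessing Pipe connected to a worker process that sends back exactly one reply per command; the real helpers in the vectorized-environment module may differ.

def send_command_to_all(remotes, command):
    ''' Broadcast the same (name, arg) command to every remote and
        collect one reply per remote, preserving order. '''
    for remote in remotes:
        remote.send(command)
    return [remote.recv() for remote in remotes]


def send_commands_to_all(remotes, commands):
    ''' Send a per-remote (name, arg) command, e.g. a different action
        to each environment, and collect the replies in order. '''
    for remote, command in zip(remotes, commands):
        remote.send(command)
    return [remote.recv() for remote in remotes]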
Example 3
def main():
    # Set a global seed for reproducibility
    set_seed(0)

    # The path for saving the logs, learning curves, and model checkpoints
    log_dir = './experiments/nlh_cfr_result/'

    # Make environment
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})

    # Set the number of iterations and how frequently we evaluate performance
    evaluate_every = 5000
    selfplay_every = 25000
    evaluate_num = 10000
    iteration_num = 8000000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    agent = DQNAgent(num_actions=env.num_actions,
                     state_shape=env.state_shape[0],
                     mlp_layers=[64, 64, 64, 64],
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     device=device)

    agents = [agent, load_model("model.pth")]

    env.set_agents(agents)

    with Logger(log_dir) as logger:
        for episode in range(iteration_num):

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that the DQN agent always plays the first position
            # and the opponent is the frozen self-play model
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance against the current opponent
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(env, evaluate_num)[0])
            if episode % selfplay_every == 0:
                save_path = os.path.join(log_dir, str(episode) + '_model.pth')
                torch.save(agent, save_path)
                print('Model saved in', save_path)
                agents = [agent, load_model(save_path)]
                env.set_agents(agents)

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve (optional)
    # plot_curve(csv_path, fig_path, 'dqn')

    # Save model
    save_path = os.path.join('./', 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)

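The load_model() helper used above is not defined in this snippet. A minimal sketch is given below, assuming the opponent checkpoints are whole agent objects serialized with torch.save, as done elsewhere in this script; the set_device call is an assumption and is guarded accordingly.

import torch

def load_model(model_path, device=None):
    ''' Load a checkpoint produced by torch.save(agent, path).
        torch.load returns the whole agent object, ready to act. '''
    agent = torch.load(model_path, map_location=device)
    # Moving the agent's networks to the target device is optional here;
    # set_device is assumed to exist on some agents, hence the guard.
    if device is not None and hasattr(agent, 'set_device'):
        agent.set_device(device)
    return agent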