def train(args):
    # Make environments. CFR only supports Leduc Hold'em.
    env = rlcard.make(
        'leduc-holdem',
        config={
            'seed': 0,
            'allow_step_back': True,
        }
    )
    eval_env = rlcard.make(
        'leduc-holdem',
        config={
            'seed': 0,
        }
    )

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Initialize CFR Agent
    agent = CFRAgent(
        env,
        os.path.join(
            args.log_dir,
            'cfr_model',
        ),
    )
    agent.load()  # If we have a saved model, load it first

    # Evaluate CFR against a random agent
    eval_env.set_agents([
        agent,
        RandomAgent(num_actions=env.num_actions),
    ])

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):
            agent.train()
            print('\rIteration {}'.format(episode), end='')

            # Evaluate the performance. Play against the random agent.
            if episode % args.evaluate_every == 0:
                agent.save()  # Save model
                logger.log_performance(
                    env.timestep,
                    tournament(
                        eval_env,
                        args.num_eval_games,
                    )[0]
                )

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, 'cfr')
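# --- Not part of the original example: a minimal sketch of the imports and
# argparse entry point train() above assumes. The flag names and defaults
# (--seed, --num_episodes, --num_eval_games, --evaluate_every, --log_dir)
# are assumptions about how args is populated; adjust them to your script.
import os
import argparse

import rlcard
from rlcard.agents import CFRAgent, RandomAgent
from rlcard.utils import set_seed, tournament, Logger, plot_curve

if __name__ == '__main__':
    parser = argparse.ArgumentParser("CFR example in RLCard")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_cfr_result/')
    train(parser.parse_args())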
def run(args):
    # Make environment
    env = rlcard.make(
        args.env,
        config={
            'seed': 42,
        }
    )

    # Seed numpy, torch, random
    set_seed(42)

    # Set agents
    agent = RandomAgent(num_actions=env.num_actions)
    env.set_agents([agent for _ in range(env.num_players)])

    # Generate data from the environment
    trajectories, player_wins = env.run(is_training=False)

    # Print out the trajectories
    print('\nTrajectories:')
    print(trajectories)
    print('\nSample raw observation:')
    pprint.pprint(trajectories[0][0]['raw_obs'])
    print('\nSample raw legal_actions:')
    pprint.pprint(trajectories[0][0]['raw_legal_actions'])
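# --- Not part of the original example: run() above only needs args.env, so a
# hand-built Namespace is enough to try it. The environment id below is an
# assumption; any registered RLCard environment works.
import argparse
import pprint

import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import set_seed

args = argparse.Namespace(env='leduc-holdem')  # hypothetical choice
run(args)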
def evaluate(args):
    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={'seed': args.seed})

    # Load models
    agents = []
    for position, model_path in enumerate(args.models):
        agents.append(load_model(model_path, env, position, device))
    env.set_agents(agents)

    # Evaluate
    rewards = tournament(env, args.num_games)
    for position, reward in enumerate(rewards):
        print(position, args.models[position], reward)
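# --- Not part of the original script: evaluate() above calls a load_model()
# helper that is not shown. A minimal reconstruction, assuming three kinds of
# model paths: a pickled torch agent (file), a saved CFR policy (directory),
# or the literal string 'random'. This is a sketch, not the original helper.
import os
import torch

def load_model(model_path, env=None, position=None, device=None):
    # position is accepted for signature compatibility but unused here
    if os.path.isfile(model_path):
        # A whole agent saved with torch.save(agent, path)
        agent = torch.load(model_path, map_location=device)
    elif os.path.isdir(model_path):
        # A CFR policy saved by CFRAgent.save() into a directory
        from rlcard.agents import CFRAgent
        agent = CFRAgent(env, model_path)
        agent.load()
    elif model_path == 'random':
        from rlcard.agents import RandomAgent
        agent = RandomAgent(num_actions=env.num_actions)
    else:
        raise ValueError('Unknown model path: {}'.format(model_path))
    return agent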
def train(args):
    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env_func = env_name_to_env_func[args.env]
    env = env_func.env()
    env.seed(args.seed)
    env.reset()

    # Initialize the agent and use random agents as opponents
    learning_agent_name = env.agents[0]
    if args.algorithm == 'dqn':
        from rlcard.agents.pettingzoo_agents import DQNAgentPettingZoo
        agent = DQNAgentPettingZoo(
            num_actions=env.action_space(learning_agent_name).n,
            state_shape=env.observation_space(learning_agent_name)["observation"].shape,
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents.pettingzoo_agents import NFSPAgentPettingZoo
        agent = NFSPAgentPettingZoo(
            num_actions=env.action_space(learning_agent_name).n,
            state_shape=env.observation_space(learning_agent_name)["observation"].shape,
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )

    agents = {learning_agent_name: agent}
    for i in range(1, env.num_agents):
        agents[env.agents[i]] = RandomAgentPettingZoo(
            num_actions=env.action_space(env.agents[i]).n,
        )

    # Start training
    num_timesteps = 0
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):
            if args.algorithm == 'nfsp':
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories = run_game_pettingzoo(env, agents, is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize_pettingzoo(trajectories)
            num_timesteps += sum([len(t) for t in trajectories.values()])

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[learning_agent_name]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                average_rewards = tournament_pettingzoo(env, agents, args.num_eval_games)
                logger.log_performance(num_timesteps, average_rewards[learning_agent_name])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
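# --- Not part of the original script: train() above looks up the PettingZoo
# environment constructor in env_name_to_env_func, which is not shown. A
# plausible sketch, assuming PettingZoo's classic card games; the version
# suffixes (e.g. _v4) depend on the installed PettingZoo release.
from pettingzoo.classic import leduc_holdem_v4, texas_holdem_v4

env_name_to_env_func = {
    'leduc-holdem': leduc_holdem_v4,
    'limit-holdem': texas_holdem_v4,
}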
def train(args):
    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(
        args.env,
        config={
            'seed': args.seed,
        }
    )

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )

    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):
            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0]
                )

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
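# --- Not part of the original script: because the whole agent is pickled with
# torch.save(agent, path), it can be restored with a single torch.load call.
# A minimal usage sketch, assuming a Leduc Hold'em run and the hypothetical
# checkpoint path below; newer torch releases may need weights_only=False.
import torch
import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import tournament

env = rlcard.make('leduc-holdem', config={'seed': 0})
agent = torch.load('experiments/leduc_holdem_dqn_result/model.pth')
env.set_agents([agent] + [RandomAgent(num_actions=env.num_actions)
                          for _ in range(1, env.num_players)])
print(tournament(env, 1000)[0])  # average payoff of the loaded agent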
def main():
    # Make environment
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})

    # Set the iteration counts and how frequently we evaluate performance
    evaluate_every = 5000
    selfplay_every = 25000
    evaluate_num = 10000
    iteration_num = 8000000

    # The initial memory size (defined here but unused below)
    memory_init_size = 100

    # Train the agent every X steps (defined here but unused below)
    train_every = 1

    agent = DQNAgent(
        num_actions=env.num_actions,
        state_shape=env.state_shape[0],
        mlp_layers=[64, 64, 64, 64],
        device=device,
    )
    agents = [agent, load_model("model.pth")]
    env.set_agents(agents)

    with Logger('./') as logger:
        for episode in range(iteration_num):
            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the frozen opponent model plays the second
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance against the current opponent
            # (note: eval_env is created above but unused here)
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep, tournament(env, evaluate_num)[0])

            # Periodically freeze a copy of the agent and use it as the opponent
            if episode % selfplay_every == 0:
                save_path = os.path.join('./', str(episode) + "model.pth")
                torch.save(agent, save_path)
                print('Model saved in', save_path)
                agents = [agent, load_model(str(episode) + "model.pth")]
                env.set_agents(agents)

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    # plot_curve(csv_path, fig_path, 'dqn')

    # Save model
    save_path = os.path.join('./', 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)


# The paths for saving the logs and learning curves
log_dir = './experiments/nlh_cfr_result/'

# Set a global seed
set_seed(0)
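# --- Not part of the original script: main() above references a load_model()
# helper that is not shown, and it assumes an initial "model.pth" already
# exists to serve as the first self-play opponent. A hedged sketch of both,
# given that the script pickles the whole agent with torch.save:
import torch

def load_model(model_path, device=None):
    # Restore a pickled agent saved via torch.save(agent, path).
    # (Newer torch releases may require weights_only=False here.)
    return torch.load(model_path, map_location=device)

# Bootstrap sketch: save a freshly initialized agent once so the first
# load_model("model.pth") call in main() succeeds, e.g.:
# torch.save(DQNAgent(...), 'model.pth')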