ctrl = Driver(
    t_prof=TrainingProfile(
        name="DISNLHT_DISTRIBUTED_LH_RNN",
        nn_type="recurrent",
        DISTRIBUTED=False,
        n_learner_actor_workers=N_WORKERS,

        eval_agent_export_freq=9999999,  # don't export
        max_buffer_size_adv=3.636e5,  # per learner-actor worker
        max_buffer_size_avrg=3.636e5,  # per learner-actor worker

        # Tiny values for quick smoke tests; scale these up for a real run.
        n_traversals_per_iter=5,
        n_batches_adv_training=12,
        n_batches_avrg_training=100,
        mini_batch_size_adv=8,
        mini_batch_size_avrg=16,

        n_merge_and_table_layer_units_adv=64,
        n_merge_and_table_layer_units_avrg=64,
        n_units_final_adv=64,
        n_units_final_avrg=64,
        n_cards_state_units_adv=64,
        n_cards_state_units_avrg=64,

        init_adv_model="last",  # warm-start advantage-net weights from the last iteration
        init_avrg_model="random",
        use_pre_layers_adv=True,
        use_pre_layers_avrg=True,

        game_cls=DiscretizedNLHoldem,
        agent_bet_set=bet_sets.B_5,

        lbr_args=LBRArgs(
            lbr_bet_set=bet_sets.B_5,
            n_lbr_hands_per_seat=8,
            lbr_check_to_round=Poker.TURN,
            # lbr_check_to_round=None,
            n_parallel_lbr_workers=N_LBR_WORKERS,
            use_gpu_for_batch_eval=False,
            DISTRIBUTED=True,
        ),
    ),
    eval_methods={"lbr": 10},
    n_iterations=50)
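# The snippets in this section assume a shared preamble. A minimal sketch, assuming the
# standard Deep-CFR / PokerRL package layout (the Driver and LBRArgs import paths are
# confirmed by the imports further down this section; N_WORKERS and N_LBR_WORKERS are
# placeholder constants you must define yourself, and PLO / VanillaEnvBuilder, used by
# two snippets below, exist only in some PokerRL forks):
from DeepCFR.EvalAgentDeepCFR import EvalAgentDeepCFR
from DeepCFR.TrainingProfile import TrainingProfile
from DeepCFR.workers.driver.Driver import Driver
from PokerRL.eval.head_to_head.H2HArgs import H2HArgs
from PokerRL.eval.lbr.LBRArgs import LBRArgs
from PokerRL.eval.rl_br.RLBRArgs import RLBRArgs
from PokerRL.game import bet_sets
from PokerRL.game.Poker import Poker
from PokerRL.game.games import (BigLeduc, DiscretizedNLHoldem, Flop5Holdem,
                                LimitHoldem, StandardLeduc)

N_WORKERS = 4      # learner-actor workers; size to your machine
N_LBR_WORKERS = 4  # parallel LBR evaluation workers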
ctrl = Driver(
    t_prof=TrainingProfile(
        name="MO_LEDUC_BigLeduc_LBR",
        nn_type="feedforward",
        eval_agent_export_freq=3,
        checkpoint_freq=3,
        n_learner_actor_workers=5,
        max_buffer_size_adv=1e6,
        n_traversals_per_iter=500,
        n_batches_adv_training=250,
        mini_batch_size_adv=2048,
        game_cls=BigLeduc,
        n_units_final_adv=64,
        n_merge_and_table_layer_units_adv=64,
        init_adv_model="random",  # re-initialize advantage-net weights randomly each iteration
        use_pre_layers_adv=False,  # shallower nets
        use_pre_layers_avrg=False,  # shallower nets

        # You can specify one or both modes. Choosing both is useful to compare them.
        eval_modes_of_algo=(
            EvalAgentDeepCFR.EVAL_MODE_SINGLE,  # SD-CFR
        ),

        DISTRIBUTED=True,
        log_verbose=True,
        rl_br_args=RLBRArgs(
            rlbr_bet_set=None,
            n_hands_each_seat=200,
            n_workers=1,

            # Training
            DISTRIBUTED=False,
            n_iterations=100,
            play_n_games_per_iter=50,

            # The DDQN
            batch_size=512,
        ),
        lbr_args=LBRArgs(
            n_lbr_hands_per_seat=30000,
            n_parallel_lbr_workers=10,
            DISTRIBUTED=True,
        ),
    ),
    eval_methods={
        'br': 1,
        # 'rlbr': 1,
        'lbr': 1,
    },
    n_iterations=12)
ctrl = Driver(
    iteration_to_import=30,
    name_to_import='NLH_1.5m_10mX2-b2048-last-patience200-Leaky-lr0.004_',
    t_prof=TrainingProfile(
        name="NLH_1.5m_10mX2-b2048-last-patience200-Leaky-lr0.004",
        nn_type="feedforward",
        DISTRIBUTED=False,
        CLUSTER=False,
        n_learner_actor_workers=1,
        max_buffer_size_adv=1500000,  # 1.5e6
        export_each_net=False,
        # path_strategy_nets="",
        checkpoint_freq=5,  # produces A SHITLOAD of GBs!
        eval_agent_export_freq=1,  # produces GBs!

        # How many of the legal actions at the current step to branch into randomly
        # (an action-breadth limit).
        n_actions_traverser_samples=4,  # 3 is the default, 4 is the current max for b_2
        # The number of traversals determines how many outcomes the network is trained on;
        # the buffer is appended with new data at every step.
        n_traversals_per_iter=3500,
        # Number of mini-batch fetches and model updates per iteration.
        n_batches_adv_training=6000,  # 5000

        use_pre_layers_adv=True,
        n_cards_state_units_adv=192,
        n_merge_and_table_layer_units_adv=64,
        n_units_final_adv=64,
        dropout_adv=0.0,
        lr_patience_adv=750,  # lr is halved (in PSWorker) if no improvement after this many batches
        lr_adv=0.004,

        # Size of each batch fed to the NN, fetched randomly from the buffer.
        mini_batch_size_adv=10000,
        init_adv_model="last",  # "last" or "random"

        game_cls=DiscretizedNLHoldem,  # PLO or DiscretizedNLHoldem
        env_bldr_cls=VanillaEnvBuilder,
        agent_bet_set=bet_sets.PL_2,
        n_seats=2,
        start_chips=10000,

        # You can specify one or both modes. Choosing both is useful to compare them.
        eval_modes_of_algo=(
            EvalAgentDeepCFR.EVAL_MODE_SINGLE,  # SD-CFR
            # EvalAgentDeepCFR.EVAL_MODE_AVRG_NET
        ),

        # Enables simplified obs. The default also works for 3+ players.
        use_simplified_headsup_obs=True,
        log_verbose=True,
        lbr_args=LBRArgs(
            lbr_bet_set=bet_sets.PL_2,
            n_lbr_hands_per_seat=1,
            lbr_check_to_round=Poker.TURN,  # recommended for 4-round games
            n_parallel_lbr_workers=1,
            use_gpu_for_batch_eval=False,
            DISTRIBUTED=False,
        ),
    ),
    eval_methods={
        "lbr": 99,  # lbr, br, h2h
    },
    n_iterations=64)
ctrl = Driver( t_prof=TrainingProfile( name="SD-CFR_LEDUC_BUF_500", nn_type="feedforward", max_buffer_size_adv=1e6, max_buffer_size_avrg=1e6, eval_agent_export_freq=999999, n_traversals_per_iter=1500, n_batches_adv_training=750, n_batches_avrg_training=5000, n_merge_and_table_layer_units_adv=64, n_merge_and_table_layer_units_avrg=64, n_units_final_adv=64, n_units_final_avrg=64, mini_batch_size_adv=2048, mini_batch_size_avrg=2048, init_adv_model="last", init_avrg_model="random", use_pre_layers_adv=False, use_pre_layers_avrg=False, eval_agent_max_strat_buf_size=500, game_cls=StandardLeduc, eval_modes_of_algo=( EvalAgentDeepCFR.EVAL_MODE_SINGLE, # SD-CFR ), DISTRIBUTED=False, log_verbose=False, ), eval_methods={ "br": 15, }, n_iterations=None)
ctrl = Driver( t_prof=TrainingProfile( name="SD-CFR_LEDUC_EXAMPLE", nn_type="feedforward", max_buffer_size_adv=3e6, eval_agent_export_freq=20, # export API to play against the agent n_traversals_per_iter=1500, n_batches_adv_training=750, n_batches_avrg_training=2000, n_merge_and_table_layer_units_adv=64, n_merge_and_table_layer_units_avrg=64, n_units_final_adv=64, n_units_final_avrg=64, mini_batch_size_adv=2048, mini_batch_size_avrg=2048, init_adv_model="last", init_avrg_model="last", use_pre_layers_adv=False, use_pre_layers_avrg=False, game_cls=StandardLeduc, # You can specify one or both modes. Choosing both is useful to compare them. eval_modes_of_algo=( EvalAgentDeepCFR.EVAL_MODE_SINGLE, # SD-CFR EvalAgentDeepCFR.EVAL_MODE_AVRG_NET, # Deep CFR ), DISTRIBUTED=False, ), eval_methods={ "br": 3, }, n_iterations=None)
ctrl = Driver( t_prof=TrainingProfile( name="EXPERIMENT_SD-CFR_vs_Deep-CFR_FHP", nn_type="feedforward", # We also support RNNs, but the paper uses FF DISTRIBUTED=True, CLUSTER=False, n_learner_actor_workers=20, # 20 workers # regulate exports export_each_net=False, checkpoint_freq=99999999, eval_agent_export_freq= 1, # produces around 15GB over 150 iterations! n_actions_traverser_samples=3, # = external sampling in FHP n_traversals_per_iter=15000, n_batches_adv_training=4000, mini_batch_size_adv=512, # *20=10240 init_adv_model="random", use_pre_layers_adv=True, n_cards_state_units_adv=192, n_merge_and_table_layer_units_adv=64, n_units_final_adv=64, max_buffer_size_adv=2e6, # *20 LAs = 40M lr_adv=0.001, lr_patience_adv=99999999, # No lr decay n_batches_avrg_training=20000, mini_batch_size_avrg=1024, # *20=20480 init_avrg_model="random", use_pre_layers_avrg=True, n_cards_state_units_avrg=192, n_merge_and_table_layer_units_avrg=64, n_units_final_avrg=64, max_buffer_size_avrg=2e6, lr_avrg=0.001, lr_patience_avrg=99999999, # No lr decay # With the H2H evaluator, these two are evaluated against eachother. eval_modes_of_algo=(EvalAgentDeepCFR.EVAL_MODE_AVRG_NET, EvalAgentDeepCFR.EVAL_MODE_SINGLE), log_verbose=True, game_cls=Flop5Holdem, # enables simplified obs. Default works also for 3+ players use_simplified_headsup_obs=True, h2h_args=H2HArgs( n_hands= 1500000, # this is per seat; so in total 3M hands per eval ), ), # Evaluate Head-to-Head every 15 iterations of both players (= every 30 alternating iterations) eval_methods={"h2h": 15}, # 150 = 300 when 2 viewing alternating iterations as 2 (as usually done). # This repo implements alternating iters as a single iter, which is why this says 150. n_iterations=150, )
ctrl = Driver( t_prof=TrainingProfile( name="BIGLEDUC_EXPLOITABILITY", DISTRIBUTED=True, n_learner_actor_workers=11, eval_agent_export_freq=9999999, # Don't export nn_type="feedforward", max_buffer_size_adv=3.636e5, # 364k * 11 = ~4M max_buffer_size_avrg=3.636e5, # 364k * 11 = ~4M # longer action sequences than FHP -> more samples/iter because external sampling. n_traversals_per_iter=800, # 800 * 11 = 8,800 n_batches_adv_training=1200, n_batches_avrg_training=10000, # trained far more than necessary n_merge_and_table_layer_units_adv=64, n_merge_and_table_layer_units_avrg=64, n_units_final_adv=64, n_units_final_avrg=64, n_cards_state_units_adv=64, n_cards_state_units_avrg=64, mini_batch_size_adv=256, # 256 * 11 = 2,816 mini_batch_size_avrg=512, # 512 * 11 = 5,632 init_adv_model= "last", # warm start neural weights with init from last iter init_avrg_model="random", use_pre_layers_adv=True, use_pre_layers_avrg=True, game_cls=BigLeduc, # You can specify one or both modes. Choosing both is useful to compare them. eval_modes_of_algo=( EvalAgentDeepCFR.EVAL_MODE_SINGLE, # SD-CFR EvalAgentDeepCFR.EVAL_MODE_AVRG_NET, # Deep CFR ), h2h_args=H2HArgs(n_hands=500000, ), log_verbose=False, ), eval_methods={ "br": 15, }, n_iterations=None)
ctrl = Driver( t_prof=TrainingProfile( name="Hanul_EXAMPLE", nn_type="recurrent", max_buffer_size_adv=3e6, eval_agent_export_freq=20, # export API to play against the agent n_traversals_per_iter=200, n_batches_adv_training=8, n_batches_avrg_training=2000, n_merge_and_table_layer_units_adv=64, n_merge_and_table_layer_units_avrg=64, n_units_final_adv=64, n_units_final_avrg=64, mini_batch_size_adv=16, mini_batch_size_avrg=16, init_adv_model="last", init_avrg_model="last", use_pre_layers_adv=False, use_pre_layers_avrg=False, game_cls=DiscretizedNLHoldem, lbr_args=LBRArgs( lbr_bet_set=bet_sets.B_5, n_lbr_hands_per_seat=80, lbr_check_to_round=Poker.TURN, n_parallel_lbr_workers=N_LBR_WORKERS, use_gpu_for_batch_eval=False, DISTRIBUTED=True, ), # You can specify one or both modes. Choosing both is useful to compare them. eval_modes_of_algo=( EvalAgentDeepCFR.EVAL_MODE_SINGLE, # SD-CFR EvalAgentDeepCFR.EVAL_MODE_AVRG_NET, # Deep CFR ), DISTRIBUTED=False, ), eval_methods={ "lbr": 5, }, n_iterations=300)
ctrl = Driver( t_prof=TrainingProfile( name="MO_HULH_1", nn_type="feedforward", # We also support RNNs, but the paper uses FF DISTRIBUTED=True, CLUSTER=False, n_learner_actor_workers=40, # 20 workers # regulate exports export_each_net=False, checkpoint_freq=99999999, eval_agent_export_freq=5, n_actions_traverser_samples=3, # = external sampling in FHP n_traversals_per_iter=500, n_batches_adv_training=2000, mini_batch_size_adv=512, # *20=10240 init_adv_model="random", use_pre_layers_adv=True, n_cards_state_units_adv=192, n_merge_and_table_layer_units_adv=64, n_units_final_adv=64, max_buffer_size_adv=2e6, # *20 LAs = 40M lr_adv=0.001, lr_patience_adv=99999999, # No lr decay # With the H2H evaluator, these two are evaluated against eachother. eval_modes_of_algo=(EvalAgentDeepCFR.EVAL_MODE_SINGLE, ), log_verbose=True, game_cls=LimitHoldem, # enables simplified obs. Default works also for 3+ players use_simplified_headsup_obs=True, ), eval_methods={}, n_iterations=50, )
ctrl = Driver( t_prof=TrainingProfile( name="LEDUC_EXPLOITABILITY", nn_type="feedforward", max_buffer_size_adv=1e6, max_buffer_size_avrg=1e6, eval_agent_export_freq=999999, # Don't export n_traversals_per_iter=1500, n_batches_adv_training=750, n_batches_avrg_training=5000, n_merge_and_table_layer_units_adv=64, n_merge_and_table_layer_units_avrg=64, n_units_final_adv=64, n_units_final_avrg=64, mini_batch_size_adv=2048, mini_batch_size_avrg=2048, init_adv_model= "last", # warm start neural weights with init from last iter init_avrg_model="random", use_pre_layers_adv=False, # shallower nets use_pre_layers_avrg=False, # shallower nets game_cls=StandardLeduc, # You can specify one or both modes. Choosing both is useful to compare them. eval_modes_of_algo=( EvalAgentDeepCFR.EVAL_MODE_SINGLE, # SD-CFR EvalAgentDeepCFR.EVAL_MODE_AVRG_NET, # Deep CFR ), DISTRIBUTED=False, log_verbose=False, ), eval_methods={ "br": 15, }, n_iterations=None)
        log_verbose=True,
        rl_br_args=RLBRArgs(
            rlbr_bet_set=None,
            # n_hands_each_seat=5 * i,
            n_workers=5,

            # Training
            DISTRIBUTED=True,
            n_iterations=1000 * (i + 1),
            play_n_games_per_iter=50,

            # The DDQN
            batch_size=512,
        ),
        # lbr_args=LBRArgs(),
    )
    return TP


for i in range(n_iter):
    TP = new_tp(i)
    ctrl = Driver(t_prof=TP,
                  eval_methods={'br': 1, 'rlbr': 1},
                  n_iterations=driver_iterations)
    ctrl.run()

    dfs = logs_util.logs_to_dfs(exp_name=exp_name, iter_number=driver_iterations)

    # Extract a measure for this hyperparameter value: RL-BR minus exact BR.
    RLBR_df = dfs['MO_LEDUC_EVAL SINGLE_stack_13: RL-BR Total']
    BR_df = dfs['MO_LEDUC_EVAL SINGLE_stack_13: BR Total']
    diff = RLBR_df - BR_df

    # Append to a dict: hyperparameter value -> (mean, std) of the measure.
    hp_measure[5 * i] = (diff.mean()['Evaluation/MA_per_G'],
                         diff.std()['Evaluation/MA_per_G'])

pdb.set_trace()
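# A hypothetical sketch of the scaffolding the fragment above implies. new_tp(i),
# n_iter, exp_name, driver_iterations, hp_measure and logs_util are names taken from
# the fragment itself, not from the Deep-CFR / PokerRL APIs; logs_util is assumed to
# be a local helper exposing logs_to_dfs(exp_name, iter_number):
import pdb

import logs_util  # assumed local helper, not part of PokerRL

exp_name = "MO_LEDUC_EVAL"  # assumed run name matching the log keys above
n_iter = 5                  # number of hyperparameter sweep points
driver_iterations = 1       # Deep-CFR iterations to run per sweep point
hp_measure = {}             # hyperparameter value -> (mean, std) of RL-BR minus BR


def new_tp(i):
    # Returns a TrainingProfile whose RL-BR settings depend on the sweep index i;
    # the fragment above is the tail of this function.
    ...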
# -------- Manual approach: wire the LBR evaluator up by hand
# agent_prof.module_args['lbr'] = LBRArgs()
# lbr_chief = Chief(t_prof=agent_prof)
# eval_master = LBRMaster(t_prof=agent_to_eval.t_prof,
#                         chief_handle=lbr_chief)
# num_workers = 3
# LBR_workers = [LBRWorker(t_prof=agent_prof,
#                          chief_handle=lbr_chief,
#                          eval_agent_cls=EvalAgentDeepCFR)
#                for _ in range(num_workers)]
# eval_master.set_worker_handles(*LBR_workers)
# eval_master.evaluate(0)

# -------- Driver approach
from DeepCFR.workers.driver.Driver import Driver
from PokerRL.eval.lbr.LBRArgs import LBRArgs

agent_prof.module_args['lbr'] = LBRArgs()
ctrl = Driver(agent_prof, eval_methods={'lbr': 1})

ctrl.chief_handle._strategy_buffers = agent_to_eval._strategy_buffers
w, info = ctrl.chief_handle._pull_current_eval_strategy()

ctrl.eval_masters['lbr'][0]._eval_agent = agent_to_eval
ctrl.eval_masters['lbr'][0].weights_for_eval_agent = w
ctrl.eval_masters['lbr'][0].evaluate(0)

# ctrl.eval_masters['br'][0]._eval_agent = agent_to_eval
# ctrl.eval_masters['br'][0].evaluate(0)
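# Where agent_to_eval / agent_prof might come from -- a sketch, assuming the agent was
# previously exported to disk (via eval_agent_export_freq); load_from_disk is PokerRL's
# standard way to restore an exported EvalAgent, and the path here is a placeholder:
from DeepCFR.EvalAgentDeepCFR import EvalAgentDeepCFR

agent_to_eval = EvalAgentDeepCFR.load_from_disk(
    path_to_eval_agent="/path/to/exported/eval_agent.pkl")  # placeholder path
agent_prof = agent_to_eval.t_prof  # the TrainingProfile embedded in the agent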
ctrl = Driver( t_prof=TrainingProfile( name= "PLO_3m_7.5mX14-b5000-last-patience350-Leaky-lr0.004-dense_residual", nn_type="dense_residual", DISTRIBUTED=True, CLUSTER=False, n_learner_actor_workers=14, # 14 workers max_buffer_size_adv=3000000, # 3e6 export_each_net=False, # path_strategy_nets="", checkpoint_freq=9999, # produces A SHITLOAD of Gbs! eval_agent_export_freq=1, # How many actions out of all legal on current step to branch randomly # = action breadth limit # 3 is the default, 4 is the current max for b_2 n_actions_traverser_samples=4, # number of traversals equal to the number of entries that will be added # to adv buffer n_traversals_per_iter=150000, # number of mini_batch fetches and model updates on each iteration n_batches_adv_training=1500, max_n_las_sync_simultaneously=20, use_pre_layers_adv=True, n_cards_state_units_adv=192, n_merge_and_table_layer_units_adv=64, # 64 n_units_final_adv=64, # 64 lr_patience_adv=350, # decrease by a factor 0.5(in PSWorker) lr_adv=0.004, # size of batch to feed to NN at once, fetched from buffer randomly. mini_batch_size_adv=5000, init_adv_model="last", # last, random game_cls=PLO, # PLO or DiscretizedNLHoldem env_bldr_cls=VanillaEnvBuilder, agent_bet_set=bet_sets.PL_2, n_seats=2, start_chips=10000, # You can specify one or both modes. Choosing both is useful to compare them. eval_modes_of_algo=( EvalAgentDeepCFR.EVAL_MODE_SINGLE, # SD-CFR ), # enables simplified obs. Default works also for 3+ players use_simplified_headsup_obs=True, log_verbose=True, lbr_args=LBRArgs( lbr_bet_set=bet_sets.PL_2, n_lbr_hands_per_seat=1, lbr_check_to_round=Poker.TURN, # recommended to set to Poker.TURN for 4-round games. n_parallel_lbr_workers=1, use_gpu_for_batch_eval=False, DISTRIBUTED=False, ), ), eval_methods={ "": 99, # lbr, br, h2h }, n_iterations=64)
if __name__ == '__main__':
    ctrl = Driver(
        t_prof=TrainingProfile(
            name="MO_LEDUC_EXPLOITABILITY",
            nn_type="feedforward",
            n_learner_actor_workers=5,
            eval_agent_export_freq=3,
            checkpoint_freq=1,
            max_buffer_size_adv=1e6,
            n_traversals_per_iter=500,
            n_batches_adv_training=250,
            mini_batch_size_adv=2048,
            game_cls=StandardLeduc,
            n_units_final_adv=64,
            n_merge_and_table_layer_units_adv=64,
            init_adv_model="random",  # re-initialize advantage-net weights randomly each iteration
            use_pre_layers_adv=False,  # shallower nets
            use_pre_layers_avrg=False,  # shallower nets

            # You can specify one or both modes. Choosing both is useful to compare them.
            eval_modes_of_algo=(
                EvalAgentDeepCFR.EVAL_MODE_SINGLE,  # SD-CFR
            ),
            DISTRIBUTED=False,
            log_verbose=True,
        ),
        eval_methods={},
        n_iterations=4,
        # Resume the run of the same name from the checkpoint written at iteration 4
        # (checkpoint_freq=1 above keeps a checkpoint for every iteration).
        name_to_import='MO_LEDUC_EXPLOITABILITY',
        iteration_to_import=4)
    ctrl.run()