Example 1
    def __init__(
        self,

        # --- general
        name,
        log_export_freq=1000,
        checkpoint_freq=99999999,
        eval_agent_export_freq=99999999,

        # --- Computing
        path_data=None,
        local_crayon_server_docker_address="localhost",
        device_inference="cpu",
        device_parameter_server="cpu",
        n_learner_actor_workers=8,
        max_n_las_sync_simultaneously=100,
        DISTRIBUTED=False,
        CLUSTER=False,
        DEBUGGING=False,
        VERBOSE=True,

        # --- env
        game_cls=StandardLeduc,
        n_seats=2,
        use_simplified_headsup_obs=True,
        start_chips=None,
        agent_bet_set=bet_sets.B_2,
        stack_randomization_range=(0, 0),
        uniform_action_interpolation=False,

        # --- Evaluation
        eval_modes_of_algo=(EvalAgentNFSP.EVAL_MODE_AVG, ),
        eval_stack_sizes=None,

        # --- NFSP
        nn_type="feedforward",
        linear=False,
        sampling=None,  # "adam" or "clean" for enhanced variants

        # Original NFSP also adds epsilon-exploration actions to the averaging buffer.
        add_random_actions_to_avg_buffer=True,
        n_br_updates_per_iter=2,
        n_avg_updates_per_iter=2,
        target_net_update_freq=300,  # every N neural net updates. Not every N global iters, episodes, or steps
        cir_buf_size_each_la=2e5,
        res_buf_size_each_la=2e6,  # larger is better; approaches exact averaging as the buffer grows
        min_prob_add_res_buf=0.0,  # 0.0 = vanilla reservoir sampling; >0 = exponential averaging
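
        # eps_* schedule the exploration epsilon's decay from eps_start toward
        # eps_min; the exact curve is implemented in the DDQN module these values
        # are forwarded to. antic_* analogously schedule the anticipatory
        # parameter, which this class stores on self for the NFSP trainer.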
        eps_start=0.06,
        eps_const=0.01,
        eps_exponent=0.5,
        eps_min=0.0,
        antic_start=0.1,
        antic_const=0.01,
        antic_exponent=0.5,
        antic_min=0.1,  # equal to antic_start, so effectively no decay

        # For clean sampling
        constant_eps_expl=0.1,

        # --- Training.
        n_steps_avg_per_iter_per_la=None,
        n_steps_br_per_iter_per_la=None,
        n_steps_per_iter_per_la=None,
        training_multiplier_iter_0=1,  # in iter 0 the untrained BR net still fills res_buf, so train extra there ("pretrain")

        # --- Q-Learning Hyperparameters
        mini_batch_size_br_per_la=128,
        dim_br=64,
        normalize_last_layer_flat=True,
        lr_br=0.1,
        deep_br=True,  # True -> Use deep multi-branch network; False -> Use shallow net
        grad_norm_clipping_br=1.0,
        optimizer_br="sgd",
        loss_br="mse",

        # --- Avg Network Hyperparameters
        mini_batch_size_avg_per_la=128,
        dim_avg=64,
        lr_avg=0.005,
        deep_avg=True,  # True -> Use deep multi-branch network; False -> Use shallow net
        grad_norm_clipping_avg=1.0,
        optimizer_avg="sgd",
        loss_avg="weighted_ce",

        # --- Optional
        lbr_args=None,
        rlbr_args=None,
    ):
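        # Build the per-network MPM (main poker module) args for the chosen
        # architecture, forward everything to the parent args class via
        # super().__init__(), then store the NFSP-specific fields on self.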
        if nn_type == "recurrent":
            raise NotImplementedError

        elif nn_type == "feedforward":
            env_bldr_cls = FlatLimitPokerEnvBuilder

            mpm_args_br = MPMArgsFLAT(dim=dim_br,
                                      normalize=normalize_last_layer_flat,
                                      deep=deep_br)
            mpm_args_avg = MPMArgsFLAT(dim=dim_avg,
                                       normalize=normalize_last_layer_flat,
                                       deep=deep_avg)

        else:
            raise ValueError(nn_type)

        super().__init__(
            name=name,
            log_verbose=VERBOSE,
            log_export_freq=log_export_freq,
            checkpoint_freq=checkpoint_freq,
            eval_agent_export_freq=eval_agent_export_freq,
            path_data=path_data,
            game_cls=game_cls,
            env_bldr_cls=env_bldr_cls,
            start_chips=start_chips,
            eval_modes_of_algo=eval_modes_of_algo,
            eval_stack_sizes=eval_stack_sizes,
            DEBUGGING=DEBUGGING,
            DISTRIBUTED=DISTRIBUTED,
            CLUSTER=CLUSTER,
            device_inference=device_inference,
            local_crayon_server_docker_address=local_crayon_server_docker_address,
            module_args={
                "ddqn":
                DDQNArgs(
                    q_args=DuelingQArgs(
                        mpm_args=mpm_args_br,
                        n_units_final=dim_br,
                    ),
                    cir_buf_size=int(cir_buf_size_each_la),
                    batch_size=mini_batch_size_br_per_la,
                    target_net_update_freq=target_net_update_freq,
                    optim_str=optimizer_br,
                    loss_str=loss_br,
                    lr=lr_br,
                    eps_start=eps_start,
                    eps_const=eps_const,
                    eps_exponent=eps_exponent,
                    eps_min=eps_min,
                    grad_norm_clipping=grad_norm_clipping_br,
                ),
                "avg":
                AvgWrapperArgs(
                    avg_net_args=AvrgNetArgs(
                        mpm_args=mpm_args_avg,
                        n_units_final=dim_avg,
                    ),
                    batch_size=mini_batch_size_avg_per_la,
                    res_buf_size=int(res_buf_size_each_la),
                    min_prob_add_res_buf=min_prob_add_res_buf,
                    loss_str=loss_avg,
                    optim_str=optimizer_avg,
                    lr=lr_avg,
                    grad_norm_clipping=grad_norm_clipping_avg,
                ),
                "env":
                game_cls.ARGS_CLS(
                    n_seats=n_seats,
                    starting_stack_sizes_list=[
                        start_chips for _ in range(n_seats)
                    ],
                    stack_randomization_range=stack_randomization_range,
                    use_simplified_headsup_obs=use_simplified_headsup_obs,
                    uniform_action_interpolation=uniform_action_interpolation,

                    # Ignored by the env unless the game uses discretized limit bets (chip counts)
                    bet_sizes_list_as_n_chips=copy.deepcopy(agent_bet_set),

                    # Ignored by the env unless the game uses discretized fraction-of-pot bets
                    bet_sizes_list_as_frac_of_pot=copy.deepcopy(agent_bet_set),
                ),
                "lbr":
                lbr_args,
                "rlbr":
                rlbr_args,
            },
            log_memory=False,
        )

        # ____________________________________________________ NFSP ____________________________________________________
        self.nn_type = nn_type
        self.n_br_updates_per_iter = int(n_br_updates_per_iter)
        self.n_avg_updates_per_iter = int(n_avg_updates_per_iter)
        self.antic_start = antic_start
        self.antic_const = antic_const
        self.antic_exponent = antic_exponent
        self.antic_min = antic_min
        self.linear = linear
        self.sampling = sampling
        self.add_random_actions_to_buffer = add_random_actions_to_avg_buffer
        self.training_multiplier_iter_0 = int(training_multiplier_iter_0)
        self.constant_eps_expl = constant_eps_expl

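        # Step-budget contract: the "adam"/"clean" sampling variants require
        # separate per-iteration step budgets for the BR and AVG networks, while
        # the vanilla variant requires a single shared budget; exactly one of the
        # two argument groups may be set. The asserts below enforce this.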
        if sampling == "adam" or sampling == "clean":
            assert n_steps_per_iter_per_la is None
            assert n_steps_br_per_iter_per_la is not None
            assert n_steps_avg_per_iter_per_la is not None
            self.n_steps_br_per_iter_per_la = int(n_steps_br_per_iter_per_la)
            self.n_steps_avg_per_iter_per_la = int(n_steps_avg_per_iter_per_la)
        else:
            assert n_steps_per_iter_per_la is not None
            assert n_steps_br_per_iter_per_la is None
            assert n_steps_avg_per_iter_per_la is None
            self.n_steps_per_iter_per_la = int(n_steps_per_iter_per_la)

        if DISTRIBUTED or CLUSTER:
            self.n_learner_actors = int(n_learner_actor_workers)
        else:
            self.n_learner_actors = 1

        self.max_n_las_sync_simultaneously = int(max_n_las_sync_simultaneously)

        assert isinstance(
            device_parameter_server,
            str), "Please pass a string (either 'cpu' or 'cuda')!"
        self.device_parameter_server = torch.device(device_parameter_server)
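
A minimal usage sketch for this constructor follows. The enclosing class is not shown above, so NFSPArgs is a hypothetical stand-in for its name; the sketch only illustrates the step-budget contract enforced by the asserts.

# Hypothetical class name -- only the __init__ above is shown in the source.
# Vanilla sampling: pass the single shared step budget.
vanilla_args = NFSPArgs(
    name="leduc_nfsp_vanilla",
    n_steps_per_iter_per_la=128,
)

# "adam"/"clean" variants: pass the two separate budgets and leave
# n_steps_per_iter_per_la as None.
clean_args = NFSPArgs(
    name="leduc_nfsp_clean",
    sampling="clean",
    n_steps_br_per_iter_per_la=128,
    n_steps_avg_per_iter_per_la=128,
)
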
Example 2
    def __init__(
        self,

        # ------ General
        name="",
        log_verbose=False,
        log_memory=False,
        log_export_freq=1,
        checkpoint_freq=99999999,
        eval_agent_export_freq=999999999,
        n_learner_actor_workers=8,
        max_n_las_sync_simultaneously=10,
        nn_type="feedforward",  # "recurrent" or "feedforward"

        # ------ Computing
        path_data=None,
        local_crayon_server_docker_address="localhost",
        device_inference="cpu",
        device_training="cpu",
        device_parameter_server="cpu",
        DISTRIBUTED=False,
        CLUSTER=False,
        DEBUGGING=False,

        # ------ Env
        game_cls=DiscretizedNLLeduc,
        n_seats=2,
        agent_bet_set=bet_sets.B_2,
        start_chips=None,
        chip_randomness=(0, 0),
        uniform_action_interpolation=False,
        use_simplified_headsup_obs=True,

        # ------ Evaluation
        eval_modes_of_algo=(EvalAgentDeepCFR.EVAL_MODE_SINGLE, ),
        eval_stack_sizes=None,

        # ------ General Deep CFR params
        n_traversals_per_iter=30000,
        iter_weighting_exponent=1.0,
        n_actions_traverser_samples=3,
        sampler="mo",
        turn_off_baseline=False,  # Only for VR-OS
        os_eps=1,
        periodic_restart=1,

        # --- Baseline Hyperparameters
        max_buffer_size_baseline=2e5,
        batch_size_baseline=512,
        n_batches_per_iter_baseline=300,
        dim_baseline=64,
        deep_baseline=True,
        normalize_last_layer_FLAT_baseline=True,

        # --- Adv Hyperparameters
        n_batches_adv_training=5000,
        init_adv_model="random",
        mini_batch_size_adv=2048,
        dim_adv=64,
        deep_adv=True,
        optimizer_adv="adam",
        loss_adv="weighted_mse",
        lr_adv=0.001,
        grad_norm_clipping_adv=1.0,
        lr_patience_adv=999999999,
        normalize_last_layer_FLAT_adv=True,
        max_buffer_size_adv=2e6,

        # ------ SPECIFIC TO AVRG NET
        n_batches_avrg_training=15000,
        init_avrg_model="random",
        dim_avrg=64,
        deep_avrg=True,
        mini_batch_size_avrg=2048,
        loss_avrg="weighted_mse",
        optimizer_avrg="adam",
        lr_avrg=0.001,
        grad_norm_clipping_avrg=1.0,
        lr_patience_avrg=999999999,
        normalize_last_layer_FLAT_avrg=True,
        max_buffer_size_avrg=2e6,

        # ------ SPECIFIC TO SINGLE
        export_each_net=False,
        eval_agent_max_strat_buf_size=None,

        # ------ Optional
        lbr_args=None,
        rlbr_args=None,
        h2h_args=None,
    ):
        if nn_type == "feedforward":
            env_bldr_cls = FlatLimitPokerEnvBuilder

            from PokerRL.rl.neural.MainPokerModuleFLAT import MPMArgsFLAT

            mpm_args_adv = MPMArgsFLAT(deep=deep_adv,
                                       dim=dim_adv,
                                       normalize=normalize_last_layer_FLAT_adv)
            mpm_args_baseline = MPMArgsFLAT_Baseline(
                deep=deep_baseline,
                dim=dim_baseline,
                normalize=normalize_last_layer_FLAT_baseline)
            mpm_args_avrg = MPMArgsFLAT(
                deep=deep_avrg,
                dim=dim_avrg,
                normalize=normalize_last_layer_FLAT_avrg)

        else:
            raise ValueError(nn_type)

        super().__init__(
            name=name,
            log_verbose=log_verbose,
            log_export_freq=log_export_freq,
            checkpoint_freq=checkpoint_freq,
            eval_agent_export_freq=eval_agent_export_freq,
            path_data=path_data,
            game_cls=game_cls,
            env_bldr_cls=env_bldr_cls,
            start_chips=start_chips,
            eval_modes_of_algo=eval_modes_of_algo,
            eval_stack_sizes=eval_stack_sizes,
            DEBUGGING=DEBUGGING,
            DISTRIBUTED=DISTRIBUTED,
            CLUSTER=CLUSTER,
            device_inference=device_inference,
            local_crayon_server_docker_address=local_crayon_server_docker_address,
            module_args={
                "adv_training":
                AdvTrainingArgs(
                    adv_net_args=DuelingQArgs(mpm_args=mpm_args_adv,
                                              n_units_final=dim_adv),
                    n_batches_adv_training=n_batches_adv_training,
                    init_adv_model=init_adv_model,
                    batch_size=mini_batch_size_adv,
                    optim_str=optimizer_adv,
                    loss_str=loss_adv,
                    lr=lr_adv,
                    grad_norm_clipping=grad_norm_clipping_adv,
                    device_training=device_training,
                    max_buffer_size=max_buffer_size_adv,
                    lr_patience=lr_patience_adv,
                ),
                "avrg_training":
                AvrgTrainingArgs(
                    avrg_net_args=AvrgNetArgs(
                        mpm_args=mpm_args_avrg,
                        n_units_final=dim_avrg,
                    ),
                    n_batches_avrg_training=n_batches_avrg_training,
                    init_avrg_model=init_avrg_model,
                    batch_size=mini_batch_size_avrg,
                    loss_str=loss_avrg,
                    optim_str=optimizer_avrg,
                    lr=lr_avrg,
                    grad_norm_clipping=grad_norm_clipping_avrg,
                    device_training=device_training,
                    max_buffer_size=max_buffer_size_avrg,
                    lr_patience=lr_patience_avrg,
                ),
                "env":
                game_cls.ARGS_CLS(
                    n_seats=n_seats,
                    starting_stack_sizes_list=[
                        start_chips for _ in range(n_seats)
                    ],
                    bet_sizes_list_as_frac_of_pot=copy.deepcopy(agent_bet_set),
                    stack_randomization_range=chip_randomness,
                    use_simplified_headsup_obs=use_simplified_headsup_obs,
                    uniform_action_interpolation=uniform_action_interpolation),
                "mccfr_baseline":
                BaselineArgs(
                    q_net_args=DuelingQArgs(
                        mpm_args=mpm_args_baseline,
                        n_units_final=dim_baseline,
                    ),
                    max_buffer_size=max_buffer_size_baseline,
                    batch_size=batch_size_baseline,
                    n_batches_per_iter_baseline=n_batches_per_iter_baseline,
                ),
                "lbr":
                lbr_args,
                "rlbr":
                rlbr_args,
                "h2h":
                h2h_args,
            },
            log_memory=log_memory,
        )

        self.nn_type = nn_type
        self.n_traversals_per_iter = int(n_traversals_per_iter)
        self.iter_weighting_exponent = iter_weighting_exponent
        self.sampler = sampler
        self.os_eps = os_eps
        self.periodic_restart = periodic_restart
        self.turn_off_baseline = turn_off_baseline
        self.n_actions_traverser_samples = n_actions_traverser_samples

        # SINGLE
        self.export_each_net = export_each_net
        self.eval_agent_max_strat_buf_size = eval_agent_max_strat_buf_size

        # Different for dist and local
        if DISTRIBUTED or CLUSTER:
            print("Running with ", n_learner_actor_workers,
                  "LearnerActor Workers.")
            self.n_learner_actors = n_learner_actor_workers
        else:
            self.n_learner_actors = 1
        self.max_n_las_sync_simultaneously = max_n_las_sync_simultaneously

        assert isinstance(
            device_parameter_server,
            str), "Please pass a string (either 'cpu' or 'cuda')!"
        self.device_parameter_server = torch.device(device_parameter_server)
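
A sketch of instantiating this variant, again with a hypothetical class name (DeepCFRArgs), illustrating how the LearnerActor count resolves: n_learner_actor_workers only takes effect when DISTRIBUTED or CLUSTER is set; otherwise a single local worker is used.

# Hypothetical class name -- only the __init__ above is shown in the source.
local_args = DeepCFRArgs(name="leduc_dcfr")
dist_args = DeepCFRArgs(name="leduc_dcfr_dist",
                        DISTRIBUTED=True,
                        n_learner_actor_workers=8)

assert local_args.n_learner_actors == 1
assert dist_args.n_learner_actors == 8
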
Example 3
    def __init__(
        self,

        # ------ General
        name="",
        log_verbose=True,
        log_export_freq=1,
        checkpoint_freq=99999999,
        eval_agent_export_freq=999999999,
        n_learner_actor_workers=8,
        max_n_las_sync_simultaneously=10,
        nn_type="feedforward",  # "recurrent" or "feedforward"

        # ------ Computing
        path_data=None,
        local_crayon_server_docker_address="localhost",
        device_inference="cpu",
        device_training="cpu",
        device_parameter_server="cpu",
        DISTRIBUTED=False,
        CLUSTER=False,
        DEBUGGING=False,

        # ------ Env
        game_cls=DiscretizedNLLeduc,
        env_bldr_cls=FlatLimitPokerEnvBuilder,
        n_seats=2,
        agent_bet_set=bet_sets.B_2,
        start_chips=None,
        chip_randomness=(0, 0),
        uniform_action_interpolation=False,
        use_simplified_headsup_obs=True,

        # ------ Evaluation
        eval_modes_of_algo=(EvalAgentDeepCFR.EVAL_MODE_SINGLE, ),
        eval_stack_sizes=None,

        # ------ General Deep CFR params
        n_traversals_per_iter=30000,
        online=False,
        iter_weighting_exponent=1.0,
        n_actions_traverser_samples=3,
        sampler="mo",

        # --- Adv Hyperparameters
        n_batches_adv_training=5000,
        init_adv_model="random",
        rnn_cls_str_adv="lstm",
        rnn_units_adv=128,
        rnn_stack_adv=1,
        dropout_adv=0.0,
        use_pre_layers_adv=False,
        n_cards_state_units_adv=96,
        n_merge_and_table_layer_units_adv=32,
        n_units_final_adv=64,
        mini_batch_size_adv=4096,
        n_mini_batches_per_la_per_update_adv=1,
        optimizer_adv="adam",
        loss_adv="weighted_mse",
        lr_adv=0.001,
        grad_norm_clipping_adv=10.0,
        lr_patience_adv=999999999,
        normalize_last_layer_FLAT_adv=True,
        max_buffer_size_adv=3e6,

        # ------ SPECIFIC TO AVRG NET
        n_batches_avrg_training=15000,
        init_avrg_model="random",
        rnn_cls_str_avrg="lstm",
        rnn_units_avrg=128,
        rnn_stack_avrg=1,
        dropout_avrg=0.0,
        use_pre_layers_avrg=False,
        n_cards_state_units_avrg=96,
        n_merge_and_table_layer_units_avrg=32,
        n_units_final_avrg=64,
        mini_batch_size_avrg=4096,
        n_mini_batches_per_la_per_update_avrg=1,
        loss_avrg="weighted_mse",
        optimizer_avrg="adam",
        lr_avrg=0.001,
        grad_norm_clipping_avrg=10.0,
        lr_patience_avrg=999999999,
        normalize_last_layer_FLAT_avrg=True,
        max_buffer_size_avrg=3e6,

        # ------ SPECIFIC TO SINGLE
        export_each_net=False,
        eval_agent_max_strat_buf_size=None,

        # ------ Optional
        lbr_args=None,
        rl_br_args=None,
        h2h_args=None,
    ):
        print(" ************************** Initing args for: ", name,
              "  **************************")

        if nn_type == "recurrent":
            from PokerRL.rl.neural.MainPokerModuleRNN import MPMArgsRNN

            # env_bldr_cls is supplied via the constructor argument
            # (e.g. HistoryEnvBuilder for recurrent nets)

            mpm_args_adv = MPMArgsRNN(
                rnn_cls_str=rnn_cls_str_adv,
                rnn_units=rnn_units_adv,
                rnn_stack=rnn_stack_adv,
                rnn_dropout=dropout_adv,
                use_pre_layers=use_pre_layers_adv,
                n_cards_state_units=n_cards_state_units_adv,
                n_merge_and_table_layer_units=n_merge_and_table_layer_units_adv
            )
            mpm_args_avrg = MPMArgsRNN(
                rnn_cls_str=rnn_cls_str_avrg,
                rnn_units=rnn_units_avrg,
                rnn_stack=rnn_stack_avrg,
                rnn_dropout=dropout_avrg,
                use_pre_layers=use_pre_layers_avrg,
                n_cards_state_units=n_cards_state_units_avrg,
                n_merge_and_table_layer_units=n_merge_and_table_layer_units_avrg
            )

        elif nn_type == "feedforward":
            from PokerRL.rl.neural.MainPokerModuleFLAT import MPMArgsFLAT

            mpm_args_adv = MPMArgsFLAT(
                use_pre_layers=use_pre_layers_adv,
                card_block_units=n_cards_state_units_adv,
                other_units=n_merge_and_table_layer_units_adv,
                normalize=normalize_last_layer_FLAT_adv)
            mpm_args_avrg = MPMArgsFLAT(
                use_pre_layers=use_pre_layers_avrg,
                card_block_units=n_cards_state_units_avrg,
                other_units=n_merge_and_table_layer_units_avrg,
                normalize=normalize_last_layer_FLAT_avrg)

        elif nn_type == "convolutional":
            from PokerRL.rl.neural.MainPokerModuleCNN import MPMArgsCNN

            mpm_args_adv = MPMArgsCNN(
                use_pre_layers=use_pre_layers_adv,
                card_block_units=n_cards_state_units_adv,
                other_units=n_merge_and_table_layer_units_adv,
                normalize=normalize_last_layer_FLAT_adv,
                dropout=dropout_adv)
            mpm_args_avrg = MPMArgsCNN(
                use_pre_layers=use_pre_layers_avrg,
                card_block_units=n_cards_state_units_avrg,
                other_units=n_merge_and_table_layer_units_avrg,
                normalize=normalize_last_layer_FLAT_avrg,
                dropout=dropout_avrg)
        elif nn_type == "dense_residual":
            from PokerRL.rl.neural.MainPokerModuleFLAT2 import MPMArgsFLAT2

            mpm_args_adv = MPMArgsFLAT2(
                use_pre_layers=use_pre_layers_adv,
                card_block_units=n_cards_state_units_adv,
                other_units=n_merge_and_table_layer_units_adv,
                normalize=normalize_last_layer_FLAT_adv,
                dropout=dropout_adv)
            mpm_args_avrg = MPMArgsFLAT2(
                use_pre_layers=use_pre_layers_avrg,
                card_block_units=n_cards_state_units_avrg,
                other_units=n_merge_and_table_layer_units_avrg,
                normalize=normalize_last_layer_FLAT_avrg,
                dropout=dropout_avrg)

        else:
            raise ValueError(nn_type)

        super().__init__(
            name=name,
            log_verbose=log_verbose,
            log_export_freq=log_export_freq,
            checkpoint_freq=checkpoint_freq,
            eval_agent_export_freq=eval_agent_export_freq,
            path_data=path_data,
            game_cls=game_cls,
            env_bldr_cls=env_bldr_cls,
            start_chips=start_chips,
            eval_modes_of_algo=eval_modes_of_algo,
            eval_stack_sizes=eval_stack_sizes,
            DEBUGGING=DEBUGGING,
            DISTRIBUTED=DISTRIBUTED,
            CLUSTER=CLUSTER,
            device_inference=device_inference,
            local_crayon_server_docker_address=local_crayon_server_docker_address,
            module_args={
                "adv_training":
                AdvTrainingArgs(
                    adv_net_args=DuelingQArgs(
                        mpm_args=mpm_args_adv,
                        n_units_final=n_units_final_adv,
                    ),
                    n_batches_adv_training=n_batches_adv_training,
                    init_adv_model=init_adv_model,
                    batch_size=mini_batch_size_adv,
                    n_mini_batches_per_update=n_mini_batches_per_la_per_update_adv,
                    optim_str=optimizer_adv,
                    loss_str=loss_adv,
                    lr=lr_adv,
                    grad_norm_clipping=grad_norm_clipping_adv,
                    device_training=device_training,
                    max_buffer_size=max_buffer_size_adv,
                    lr_patience=lr_patience_adv,
                ),
                "avrg_training":
                AvrgTrainingArgs(
                    avrg_net_args=AvrgNetArgs(
                        mpm_args=mpm_args_avrg,
                        n_units_final=n_units_final_avrg,
                    ),
                    n_batches_avrg_training=n_batches_avrg_training,
                    init_avrg_model=init_avrg_model,
                    batch_size=mini_batch_size_avrg,
                    n_mini_batches_per_update=n_mini_batches_per_la_per_update_avrg,
                    loss_str=loss_avrg,
                    optim_str=optimizer_avrg,
                    lr=lr_avrg,
                    grad_norm_clipping=grad_norm_clipping_avrg,
                    device_training=device_training,
                    max_buffer_size=max_buffer_size_avrg,
                    lr_patience=lr_patience_avrg,
                ),
                "env":
                game_cls.ARGS_CLS(
                    n_seats=n_seats,
                    starting_stack_sizes_list=[
                        start_chips for _ in range(n_seats)
                    ],
                    bet_sizes_list_as_frac_of_pot=copy.deepcopy(agent_bet_set),
                    stack_randomization_range=chip_randomness,
                    use_simplified_headsup_obs=use_simplified_headsup_obs,
                    uniform_action_interpolation=uniform_action_interpolation),
                "lbr":
                lbr_args,
                "rlbr":
                rl_br_args,
                "h2h":
                h2h_args,
            })

        self.nn_type = nn_type
        self.online = online
        self.n_traversals_per_iter = n_traversals_per_iter
        self.iter_weighting_exponent = iter_weighting_exponent
        self.sampler = sampler
        self.n_actions_traverser_samples = n_actions_traverser_samples

        # SINGLE
        self.export_each_net = export_each_net
        self.eval_agent_max_strat_buf_size = eval_agent_max_strat_buf_size

        # Different for dist and local
        if DISTRIBUTED or CLUSTER:
            print("Running with ", n_learner_actor_workers,
                  "LearnerActor Workers.")
            self.n_learner_actors = n_learner_actor_workers
        else:
            self.n_learner_actors = 1
        self.max_n_las_sync_simultaneously = max_n_las_sync_simultaneously

        assert isinstance(
            device_parameter_server,
            str), "Please pass a string (either 'cpu' or 'cuda')!"
        self.device_parameter_server = torch.device(device_parameter_server)
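
This variant dispatches on nn_type over four architectures. Below is a sketch of selecting each; the class name is hypothetical, and the HistoryEnvBuilder pairing for "recurrent" is an assumption suggested by the commented-out hint in the source.

# Hypothetical class name -- only the __init__ above is shown in the source.
for nn_type in ("feedforward", "convolutional", "dense_residual"):
    args = DeepCFRArgs(name="leduc_" + nn_type, nn_type=nn_type)

# "recurrent" presumably needs a history-based env builder passed explicitly
# (assumption; see the HistoryEnvBuilder hint in the recurrent branch):
rnn_args = DeepCFRArgs(name="leduc_rnn",
                       nn_type="recurrent",
                       env_bldr_cls=HistoryEnvBuilder)

# Any other value raises ValueError(nn_type) before the base class is built:
try:
    DeepCFRArgs(name="bad", nn_type="transformer")
except ValueError:
    pass
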
Example 4
    def __init__(
        self,

        # --- general
        name,
        log_export_freq=200,
        checkpoint_freq=99999999,
        lite_checkpoint=False,
        lite_checkpoint_steps=128000,
        export_hands_freq=10000,
        eval_agent_export_freq=99999999,

        # --- Computing
        path_data=None,
        local_crayon_server_docker_address="localhost",
        device_inference="cpu",
        device_parameter_server="cpu",
        n_learner_actor_workers=8,
        max_n_las_sync_simultaneously=100,
        DISTRIBUTED=False,
        CLUSTER=False,
        DEBUGGING=False,
        VERBOSE=True,
        TESTING=False,

        # --- env
        game_cls=StandardLeduc,
        n_seats=2,
        use_simplified_headsup_obs=True,
        start_chips=None,
        use_canonical=False,
        agent_bet_set=bet_sets.B_2,
        stack_randomization_range=(0, 0),
        uniform_action_interpolation=False,

        # --- Evaluation
        eval_modes_of_algo=(EvalAgentNFSP.EVAL_MODE_AVG, ),
        eval_stack_sizes=None,

        # --- NFSP
        nn_type="feedforward",
        nn_structure="paper",
        feedforward_env_builder=FlatLimitPokerEnvBuilder,
        anticipatory_parameter=0.1,
        first_and_third_units=1024,
        second_and_fourth_units=512,

        # Original NFSP also adds epsilon-exploration actions to the averaging buffer.
        add_random_actions_to_avg_buffer=True,
        n_br_updates_per_iter=2,
        n_avg_updates_per_iter=2,
        target_net_update_freq=300,  # every N neural net updates. Not every N global iters, episodes, or steps
        cir_buf_size_each_la=2e5,
        res_buf_size_each_la=2e6,  # larger is better; approaches exact averaging as the buffer grows
        min_prob_add_res_buf=0.0,  # 0.0 = vanilla reservoir sampling; >0 = exponential averaging
        action_and_hand_buffer_size=20000,
        eps_start=0.06,
        eps_const=0.01,
        eps_exponent=0.5,
        eps_min=0.0,

        # --- Training.
        n_steps_per_iter_per_la=128,
        n_steps_pretrain_per_la=0,
        n_envs=128,
        mini_batch_size_br_per_la=128,
        n_mini_batches_per_la_per_update_br=1,  # total number of samples per iter is this * the batch size above
        mini_batch_size_avg_per_la=128,
        n_mini_batches_per_la_per_update_avg=1,  # total number of samples per iter is this * the batch size above
        training_multiplier_iter_0=1,  # in iter 0 the untrained BR net still fills res_buf, so train extra there ("pretrain")

        # --- Q-Learning Hyperparameters
        n_cards_state_units_br=192,
        n_merge_and_table_layer_units_br=64,
        n_units_final_br=64,
        normalize_last_layer_flat=False,
        rnn_cls_str_br="lstm",
        rnn_units_br=128,
        rnn_stack_br=1,
        lr_br=0.1,
        dropout_br=0.0,
        use_pre_layers_br=True,  # True -> Use deep multi-branch network; False -> Use shallow net
        grad_norm_clipping_br=10.0,
        optimizer_br="sgd",
        loss_br="mse",

        # --- Avg Network Hyperparameters
        n_cards_state_units_avg=192,
        n_merge_and_table_layer_units_avg=64,
        n_units_final_avg=64,
        rnn_cls_str_avg="lstm",
        rnn_units_avg=128,
        rnn_stack_avg=1,
        lr_avg=0.005,
        dropout_avg=0.0,
        use_pre_layers_avg=True,  # True -> Use deep multi-branch network; False -> Use shallow net
        grad_norm_clipping_avg=10.0,
        optimizer_avg="sgd",
        loss_avg="ce",

        # --- Optional
        lbr_args=None,
        rlbr_args=None,
        history_args=None,
        offline_args=None,
    ):
        print(" ************************** Initing args for: ", name,
              "  **************************")
        if nn_type == "recurrent":
            env_bldr_cls = HistoryEnvBuilder

            mpm_args_br = MPMArgsRNN(
                rnn_cls_str=rnn_cls_str_br,
                rnn_units=rnn_units_br,
                rnn_stack=rnn_stack_br,
                rnn_dropout=dropout_br,
                use_pre_layers=use_pre_layers_br,
                n_cards_state_units=n_cards_state_units_br,
                n_merge_and_table_layer_units=n_merge_and_table_layer_units_br)
            mpm_args_avg = MPMArgsRNN(
                rnn_cls_str=rnn_cls_str_avg,
                rnn_units=rnn_units_avg,
                rnn_stack=rnn_stack_avg,
                rnn_dropout=dropout_avg,
                use_pre_layers=use_pre_layers_avg,
                n_cards_state_units=n_cards_state_units_avg,
                n_merge_and_table_layer_units=n_merge_and_table_layer_units_avg
            )

        elif nn_type == "feedforward":
            env_bldr_cls = feedforward_env_builder
            if nn_structure == "paper":
                mpm_args_br = PaperMPMArgsFLAT(
                    first_units=first_and_third_units,
                    second_units=second_and_fourth_units,
                    third_units=first_and_third_units,
                    fourth_units=second_and_fourth_units,
                    normalize=normalize_last_layer_flat)
                mpm_args_avg = PaperMPMArgsFLAT(
                    first_units=first_and_third_units,
                    second_units=second_and_fourth_units,
                    third_units=first_and_third_units,
                    fourth_units=second_and_fourth_units)
            else:
                mpm_args_br = MPMArgsFLAT(
                    use_pre_layers=use_pre_layers_br,
                    card_block_units=n_cards_state_units_br,
                    other_units=n_merge_and_table_layer_units_br,
                    normalize=normalize_last_layer_flat,
                )
                mpm_args_avg = MPMArgsFLAT(
                    use_pre_layers=use_pre_layers_avg,
                    card_block_units=n_cards_state_units_avg,
                    other_units=n_merge_and_table_layer_units_avg)

        else:
            raise ValueError(nn_type)

        super().__init__(
            name=name,
            log_verbose=VERBOSE,
            log_export_freq=log_export_freq,
            checkpoint_freq=checkpoint_freq,
            export_hands_freq=export_hands_freq,
            eval_agent_export_freq=eval_agent_export_freq,
            path_data=path_data,
            game_cls=game_cls,
            env_bldr_cls=env_bldr_cls,
            start_chips=start_chips,
            eval_modes_of_algo=eval_modes_of_algo,
            eval_stack_sizes=eval_stack_sizes,
            DEBUGGING=DEBUGGING,
            TESTING=TESTING,
            DISTRIBUTED=DISTRIBUTED,
            CLUSTER=CLUSTER,
            device_inference=device_inference,
            local_crayon_server_docker_address=local_crayon_server_docker_address,
            module_args={
                "ddqn":
                DDQNArgs(
                    q_args=DuelingQArgs(
                        mpm_args=mpm_args_br,
                        n_units_final=n_units_final_br,
                    ),
                    cir_buf_size=int(cir_buf_size_each_la),
                    batch_size=mini_batch_size_br_per_la,
                    n_mini_batches_per_update=n_mini_batches_per_la_per_update_br,
                    target_net_update_freq=target_net_update_freq,
                    optim_str=optimizer_br,
                    loss_str=loss_br,
                    lr=lr_br,
                    eps_start=eps_start,
                    eps_const=eps_const,
                    eps_exponent=eps_exponent,
                    eps_min=eps_min,
                    grad_norm_clipping=grad_norm_clipping_br,
                ),
                "avg":
                AvgWrapperArgs(
                    avg_net_args=AvrgNetArgs(
                        mpm_args=mpm_args_avg,
                        n_units_final=n_units_final_avg,
                    ),
                    batch_size=mini_batch_size_avg_per_la,
                    n_mini_batches_per_update=n_mini_batches_per_la_per_update_avg,
                    res_buf_size=int(res_buf_size_each_la),
                    min_prob_add_res_buf=min_prob_add_res_buf,
                    loss_str=loss_avg,
                    optim_str=optimizer_avg,
                    lr=lr_avg,
                    grad_norm_clipping=grad_norm_clipping_avg,
                ),
                "env":
                game_cls.ARGS_CLS(
                    n_seats=n_seats,
                    starting_stack_sizes_list=[
                        start_chips for _ in range(n_seats)
                    ],
                    stack_randomization_range=stack_randomization_range,
                    use_simplified_headsup_obs=use_simplified_headsup_obs,
                    uniform_action_interpolation=uniform_action_interpolation,

                    # Ignored by the env unless the game uses discretized fraction-of-pot bets
                    bet_sizes_list_as_frac_of_pot=copy.deepcopy(agent_bet_set),
                ),
                "lbr":
                lbr_args,
                "rlbr":
                rlbr_args,
                "history":
                history_args,
                "offline":
                offline_args,
            })

        # ____________________________________________________ NFSP ____________________________________________________
        self.lite_checkpoint = lite_checkpoint
        self.lite_checkpoint_steps = lite_checkpoint_steps
        self.use_canonical = use_canonical
        self.nn_type = nn_type
        self.n_br_updates_per_iter = int(n_br_updates_per_iter)
        self.n_avg_updates_per_iter = int(n_avg_updates_per_iter)
        self.anticipatory_parameter = anticipatory_parameter
        self.add_random_actions_to_buffer = add_random_actions_to_avg_buffer
        self.training_multiplier_iter_0 = int(training_multiplier_iter_0)
        self.n_envs = int(n_envs)
        self.n_steps_pretrain_per_la = int(n_steps_pretrain_per_la)
        self.n_steps_per_iter_per_la = int(n_steps_per_iter_per_la)
        self.action_and_hand_buffer_size = action_and_hand_buffer_size

        if DISTRIBUTED or CLUSTER:
            self.n_learner_actors = int(n_learner_actor_workers)
        else:
            self.n_learner_actors = 1

        self.max_n_las_sync_simultaneously = int(max_n_las_sync_simultaneously)

        assert isinstance(
            device_parameter_server,
            str), "Please pass a string (either 'cpu' or 'cuda')!"
        self.device_parameter_server = torch.device(device_parameter_server)
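
In this last variant, nn_structure chooses the feedforward topology: "paper" builds the alternating first_and_third_units / second_and_fourth_units stack via PaperMPMArgsFLAT, and any other value falls through to the card/table-branch MPMArgsFLAT. A sketch, with a hypothetical class name:

# Hypothetical class name -- only the __init__ above is shown in the source.
paper_args = NFSPArgs(
    name="leduc_nfsp_paper",
    nn_type="feedforward",
    nn_structure="paper",  # PaperMPMArgsFLAT: 1024/512/1024/512 with the defaults
    first_and_third_units=1024,
    second_and_fourth_units=512,
)

branch_args = NFSPArgs(
    name="leduc_nfsp_branch",
    nn_type="feedforward",
    nn_structure="branching",  # any value other than "paper" selects MPMArgsFLAT
)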