def main():
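    # PARTNERS is module-level state: a list of partner agents, or None for self-play.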
    global PARTNERS
    num_partners = len(PARTNERS) if PARTNERS is not None else 1

    print("model path: ", model_path)
    net_arch = [args.netsz, args.latentz]
    partner_net_arch = [args.netsz, args.netsz]
    policy_kwargs = dict(activation_fn=nn.ReLU,
                         net_arch=[dict(vf=net_arch, pi=net_arch)],
                         partner_net_arch=[dict(vf=partner_net_arch, pi=partner_net_arch)],
                         num_partners=num_partners,
                         baseline=args.baseline,
                         nomain=args.nomain,
                         )

    def load_model_fn(partners, testing, try_load=True):
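        # Thin wrapper that bakes the run-specific configuration into load_model,
        # so callers only choose the partners and the train/test mode.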
        return load_model(model_path=model_path, policy_class=HanabiPolicy,
                          policy_kwargs=policy_kwargs, env=env, hp=HP,
                          partners=partners, testing=testing, try_load=try_load)

    def learn_model_fn(model, timesteps, save, period):
        return learn(model, model_name=model_name, model_path=model_path,
                     timesteps=timesteps, save=save, period=period)

    # TRAINING
    if not args.testing:
        print("#section Training")
        model = load_model_fn(partners=PARTNERS, testing=False)
        learn_model_fn(model, timesteps=args.timesteps, save=True, period=2000)

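    # Adaptation budget and logging period for the testing runs below.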
    ts, period = 25600, HP['n_steps_testing']
    # TESTING
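    # Adapt the trained model to the given partners (modular vs. baseline),
    # plus a from-scratch run for comparison.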
    if args.testing and not args.zeroshot:
        if args.baseline:
            adapt_partner_baseline(load_model_fn, learn_model_fn, partners=PARTNERS,
                                   timesteps=ts, period=period, do_optimal=False)
        else:
            adapt_partner_modular(load_model_fn, learn_model_fn, partners=PARTNERS,
                                  timesteps=ts, period=period, do_optimal=False)
        adapt_partner_scratch(load_model_fn, learn_model_fn, partners=PARTNERS,
                              timesteps=ts, period=period, do_optimal=False)

    if args.testing and args.zeroshot:
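        # Zero-shot setting: adapt_task works from the train/test partner split.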
        adapt_task(load_model_fn, learn_model_fn, train_partners=TRAIN_PARTNERS,
                   test_partners=TEST_PARTNERS, timesteps=args.timesteps, period=200)
Example #2
def main():
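    # Blocks gridworld; per the flag names, vis1/vis2 control each agent's
    # visibility and max_move_number caps the episode length.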
    env = gym.make('blocks-v0',
                   grid_size=args.n,
                   vis1=args.vis1,
                   vis2=args.vis2,
                   one_sided_reward=args.onesided,
                   max_move_number=args.maxmovenumber)
    num_partners = len(PARTNERS) if PARTNERS is not None else 1

    print("model path: ", model_path)
    net_arch = [args.netsz, args.latentz]
    partner_net_arch = [args.netsz, args.netsz]
    policy_kwargs = dict(
        activation_fn=nn.ReLU,
        net_arch=[dict(vf=net_arch, pi=net_arch)],
        partner_net_arch=[dict(vf=partner_net_arch, pi=partner_net_arch)],
        num_partners=num_partners,
        baseline=args.baseline,
        nomain=args.nomain,
    )

    def load_model_fn(partners, testing, try_load=True):
        return load_model(model_path=model_path,
                          policy_class=BlocksPolicy,
                          policy_kwargs=policy_kwargs,
                          env=env,
                          hp=HP,
                          partners=partners,
                          testing=testing,
                          try_load=try_load)

    def learn_model_fn(model, timesteps, save, period):
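        # In self-play, only save checkpoints once the score clears a threshold.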
        save_thresh = 19.2 if args.selfplay else None
        return learn(model,
                     model_name=model_name,
                     model_path=model_path,
                     timesteps=timesteps,
                     save=save,
                     period=period,
                     save_thresh=save_thresh)

    # TRAINING
    if not args.testing:
        print("#section Training")
        model = load_model_fn(partners=PARTNERS, testing=False)
        learn_model_fn(model, timesteps=args.timesteps, save=True, period=5000)

    ts, period = 25600, HP['n_steps_testing']
    # TESTING
    if args.testing and not args.zeroshot:
        if args.baseline:
            adapt_partner_baseline(load_model_fn,
                                   learn_model_fn,
                                   partners=PARTNERS,
                                   timesteps=ts,
                                   period=period,
                                   do_optimal=False)
        else:
            adapt_partner_modular(load_model_fn,
                                  learn_model_fn,
                                  partners=PARTNERS,
                                  timesteps=ts,
                                  period=period,
                                  do_optimal=False)

    if args.testing and args.zeroshot:
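        # Zero-shot transfer including inverted-role partner sets; timesteps1 and
        # timesteps2 presumably split training into two phases.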
        adapt_task(load_model_fn,
                   learn_model_fn,
                   train_partners=TRAIN_PARTNERS,
                   test_partners=TEST_PARTNERS,
                   invert_train_partners=INVERTTRAIN_PARTNERS,
                   invert_test_partners=INVERTTEST_PARTNERS,
                   timesteps1=100000,
                   timesteps2=500000,
                   period=5000)
Example #3
def main(args):
    # global PARTNERS
    
    # Get model name and path
    model_name, model_path = get_model_name_and_path(args.run, mreg=args.mreg)
    
    # Hyperparameters TODO -> Work out a gin config for these
    HP = {
        'n_steps': 640,
        'n_steps_testing': 640,
        'batch_size': 160,
        'n_epochs': 5,
        'n_epochs_testing': 5,
        'mreg': args.mreg,
    }

    # Game config
    config = {
        "colors": args.colors,
        "ranks": args.ranks,
        "players": 2,
        "hand_size": args.hand_sz,
        "max_information_tokens": args.info,
        "max_life_tokens": args.life,
        "observation_type": pyhanabi.AgentObservationType.CARD_KNOWLEDGE.value
    }

    # TODO Need to integrate this with Anton's hmf
    # NOTE He is using gym to make the env, which means it is written
    # in a gym style?
    env = gym.make('hanabi-v0', config=config)

    # This is the list of partners. It will be None for Selfplay, and set to
    # the other partners for Multiplay.
    PARTNERS = None

    if not args.selfplay:
        # TODO Inject Rulebased Partner Config
        setting, partner_type = "", "ppo"
        
        # get the config parameters of the hanabi partners
        TRAIN_PARTNERS, TEST_PARTNERS = get_hanabi_partners(setting, partner_type)
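        # Testing adapts to a single held-out partner (picked by index k);
        # training uses the full set of training partners.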
        PARTNERS = [TEST_PARTNERS[args.k % len(TEST_PARTNERS)]] if args.testing else TRAIN_PARTNERS

    num_partners = len(PARTNERS) if PARTNERS is not None else 1

    print("model path: ", model_path)
    
    net_arch = [args.netsz,    # network dimensions
                args.latentz]  # latent dimensions (Z)

    # NOTE insertion point for rule-based agents -> Look at how to interface this with Umut's work
    # Define the architecture for the partner network
    partner_net_arch = [args.netsz, args.netsz]
    
    policy_kwargs = dict(activation_fn=nn.ReLU,
                         net_arch=[dict(vf=net_arch, pi=net_arch)],
                         # NOTE this needs to be changed somehow to incorporate multiple partners
                         partner_net_arch=[dict(vf=partner_net_arch, pi=partner_net_arch)],
                         num_partners=num_partners,
                         baseline=args.baseline,
                         nomain=args.nomain,
                         )

    def load_model_fn(partners, testing, try_load=True):
        return load_model(model_path=model_path,
                          policy_class=HanabiPolicy,
                          policy_kwargs=policy_kwargs,
                          env=env,
                          hp=HP,
                          partners=partners,
                          testing=testing,
                          try_load=try_load)

    def learn_model_fn(model, timesteps, save, period):
        return learn(model,
                     model_name=model_name,
                     model_path=model_path,
                     timesteps=timesteps,
                     save=save,
                     period=period)

    # TRAINING
    if not args.testing:
        print("#Section Training")
        
        
        # PARTNERS is passed to load the model. Question is what role does it play ?
        model = load_model_fn(  partners=PARTNERS, 
                                testing=False
                            )
        
        learn_model_fn( model, 
                        timesteps=args.timesteps, 
                        save=True, 
                        period=2000
                    )

    ts, period = 25600, HP['n_steps_testing']

    # TESTING
    if args.testing and not args.zeroshot:
        if args.baseline:
            adapt_partner_baseline(load_model_fn,
                                   learn_model_fn,
                                   partners=PARTNERS,
                                   timesteps=ts,
                                   period=period,
                                   do_optimal=False)
        else:
            adapt_partner_modular(load_model_fn,
                                  learn_model_fn,
                                  partners=PARTNERS,
                                  timesteps=ts,
                                  period=period,
                                  do_optimal=False)

        adapt_partner_scratch(load_model_fn,
                              learn_model_fn,
                              partners=PARTNERS,
                              timesteps=ts,
                              period=period,
                              do_optimal=False)

    if args.testing and args.zeroshot:
        adapt_task(load_model_fn,
                   learn_model_fn,
                   train_partners=TRAIN_PARTNERS,
                   test_partners=TEST_PARTNERS,
                   timesteps=args.timesteps,
                   period=200)
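
All three entry points read their settings from a shared args namespace, but the parser itself is not shown on this page. The sketch below reconstructs it purely from the args.* attributes accessed above; every type, default, and comment is a guess, not the repository's actual definition.

# Hypothetical CLI parser reconstructed from the args.* accesses above.
# All types and defaults are assumptions.
import argparse

def build_parser():
    p = argparse.ArgumentParser()
    # Policy / network flags (shared by all three examples)
    p.add_argument('--netsz', type=int, default=64)       # hidden layer width
    p.add_argument('--latentz', type=int, default=16)     # ego latent size (Z)
    p.add_argument('--baseline', action='store_true')     # baseline architecture
    p.add_argument('--nomain', action='store_true')       # disable the main head
    # Run control
    p.add_argument('--run', type=str, default='run0')     # run / model name
    p.add_argument('--timesteps', type=int, default=1000000)
    p.add_argument('--testing', action='store_true')      # adapt instead of train
    p.add_argument('--zeroshot', action='store_true')     # zero-shot evaluation
    p.add_argument('--selfplay', action='store_true')     # train without fixed partners
    p.add_argument('--k', type=int, default=0)            # held-out partner index
    p.add_argument('--mreg', type=float, default=0.0)     # regularization weight
    # Hanabi game config (examples 1 and 3)
    p.add_argument('--colors', type=int, default=5)
    p.add_argument('--ranks', type=int, default=5)
    p.add_argument('--hand_sz', type=int, default=5)
    p.add_argument('--info', type=int, default=8)         # max information tokens
    p.add_argument('--life', type=int, default=3)         # max life tokens
    # Blocks game config (example 2)
    p.add_argument('--n', type=int, default=5)            # grid size
    p.add_argument('--vis1', type=int, default=1)
    p.add_argument('--vis2', type=int, default=1)
    p.add_argument('--onesided', action='store_true')
    p.add_argument('--maxmovenumber', type=int, default=20)
    return p

if __name__ == '__main__':
    args = build_parser().parse_args()
    main(args)  # matches Example #3's signature; the other mains take no args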