def main():
    global PARTNERS
    num_partners = len(PARTNERS) if PARTNERS is not None else 1
    print("model path: ", model_path)

    net_arch = [args.netsz, args.latentz]
    partner_net_arch = [args.netsz, args.netsz]
    policy_kwargs = dict(
        activation_fn=nn.ReLU,
        net_arch=[dict(vf=net_arch, pi=net_arch)],
        partner_net_arch=[dict(vf=partner_net_arch, pi=partner_net_arch)],
        num_partners=num_partners,
        baseline=args.baseline,
        nomain=args.nomain,
    )

    def load_model_fn(partners, testing, try_load=True):
        return load_model(model_path=model_path, policy_class=HanabiPolicy, policy_kwargs=policy_kwargs,
                          env=env, hp=HP, partners=partners, testing=testing, try_load=try_load)

    def learn_model_fn(model, timesteps, save, period):
        return learn(model, model_name=model_name, model_path=model_path,
                     timesteps=timesteps, save=save, period=period)

    # TRAINING
    if not args.testing:
        print("#section Training")
        model = load_model_fn(partners=PARTNERS, testing=False)
        learn_model_fn(model, timesteps=args.timesteps, save=True, period=2000)

    ts, period = 25600, HP['n_steps_testing']

    # TESTING
    if args.testing and not args.zeroshot:
        if args.baseline:
            adapt_partner_baseline(load_model_fn, learn_model_fn, partners=PARTNERS,
                                   timesteps=ts, period=period, do_optimal=False)
        else:
            adapt_partner_modular(load_model_fn, learn_model_fn, partners=PARTNERS,
                                  timesteps=ts, period=period, do_optimal=False)
            adapt_partner_scratch(load_model_fn, learn_model_fn, partners=PARTNERS,
                                  timesteps=ts, period=period, do_optimal=False)

    if args.testing and args.zeroshot:
        adapt_task(load_model_fn, learn_model_fn,
                   train_partners=TRAIN_PARTNERS, test_partners=TEST_PARTNERS,
                   timesteps=args.timesteps, period=200)
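# The policy_kwargs above use a Stable-Baselines3-style net_arch spec of the form
# [dict(vf=[...], pi=[...])], i.e. separate MLP trunks for the policy and value heads.
# The sketch below is illustrative only; HanabiPolicy's actual constructor is not shown
# in this file, and build_heads / input_dim are hypothetical names used for the example.
import torch.nn as nn

def build_heads(input_dim, net_arch, activation_fn=nn.ReLU):
    """Build separate pi/vf MLPs from a [dict(pi=[...], vf=[...])] spec."""
    spec = net_arch[0]

    def mlp(sizes):
        layers, last = [], input_dim
        for size in sizes:
            layers += [nn.Linear(last, size), activation_fn()]
            last = size
        return nn.Sequential(*layers), last

    pi_net, pi_dim = mlp(spec["pi"])  # policy trunk: input_dim -> netsz -> latentz
    vf_net, vf_dim = mlp(spec["vf"])  # value trunk:  input_dim -> netsz -> latentz
    return pi_net, pi_dim, vf_net, vf_dim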
def main():
    env = gym.make('blocks-v0', grid_size=args.n, vis1=args.vis1, vis2=args.vis2,
                   one_sided_reward=args.onesided, max_move_number=args.maxmovenumber)
    num_partners = len(PARTNERS) if PARTNERS is not None else 1
    print("model path: ", model_path)

    net_arch = [args.netsz, args.latentz]
    partner_net_arch = [args.netsz, args.netsz]
    policy_kwargs = dict(
        activation_fn=nn.ReLU,
        net_arch=[dict(vf=net_arch, pi=net_arch)],
        partner_net_arch=[dict(vf=partner_net_arch, pi=partner_net_arch)],
        num_partners=num_partners,
        baseline=args.baseline,
        nomain=args.nomain,
    )

    def load_model_fn(partners, testing, try_load=True):
        return load_model(model_path=model_path, policy_class=BlocksPolicy, policy_kwargs=policy_kwargs,
                          env=env, hp=HP, partners=partners, testing=testing, try_load=try_load)

    def learn_model_fn(model, timesteps, save, period):
        save_thresh = 19.2 if args.selfplay else None
        return learn(model, model_name=model_name, model_path=model_path,
                     timesteps=timesteps, save=save, period=period, save_thresh=save_thresh)

    # TRAINING
    if not args.testing:
        print("#section Training")
        model = load_model_fn(partners=PARTNERS, testing=False)
        learn_model_fn(model, timesteps=args.timesteps, save=True, period=5000)

    ts, period = 25600, HP['n_steps_testing']

    # TESTING
    if args.testing and not args.zeroshot:
        if args.baseline:
            adapt_partner_baseline(load_model_fn, learn_model_fn, partners=PARTNERS,
                                   timesteps=ts, period=period, do_optimal=False)
        else:
            adapt_partner_modular(load_model_fn, learn_model_fn, partners=PARTNERS,
                                  timesteps=ts, period=period, do_optimal=False)

    if args.testing and args.zeroshot:
        adapt_task(load_model_fn, learn_model_fn,
                   train_partners=TRAIN_PARTNERS, test_partners=TEST_PARTNERS,
                   invert_train_partners=INVERTTRAIN_PARTNERS, invert_test_partners=INVERTTEST_PARTNERS,
                   timesteps1=100000, timesteps2=500000, period=5000)
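# gym.make('blocks-v0', ...) above forwards keyword arguments to the environment
# constructor, which only resolves if the id was registered with a matching entry point.
# A minimal sketch, assuming a BlocksEnv class; the 'blocks_env:BlocksEnv' path and the
# default kwargs below are hypothetical and should point at the actual environment module.
from gym.envs.registration import register

register(
    id='blocks-v0',
    entry_point='blocks_env:BlocksEnv',  # hypothetical module:Class path
    kwargs=dict(grid_size=3, vis1=True, vis2=True,
                one_sided_reward=False, max_move_number=20),  # assumed defaults; gym.make(...) overrides them
)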
def main(args):
    # global PARTNERS

    # Get model name and path
    model_name, model_path = get_model_name_and_path(args.run, mreg=args.mreg)

    # Hyperparameters. TODO -> work out a gin config for these
    HP = {
        'n_steps': 640,
        'n_steps_testing': 640,
        'batch_size': 160,
        'n_epochs': 5,
        'n_epochs_testing': 5,
        'mreg': args.mreg,
    }

    # Game config
    config = {
        "colors": args.colors,
        "ranks": args.ranks,
        "players": 2,
        "hand_size": args.hand_sz,
        "max_information_tokens": args.info,
        "max_life_tokens": args.life,
        "observation_type": pyhanabi.AgentObservationType.CARD_KNOWLEDGE.value
    }
    # TODO Need to integrate this with Anton's hmf
    # NOTE He is using gym to make the env, which means it is written in a gym style?
    env = gym.make('hanabi-v0', config=config)

    # This is the list of partners. It will be None for self-play, and set to
    # the other partners for multi-play.
    PARTNERS = None
    if not args.selfplay:
        # TODO Inject rule-based partner config
        setting, partner_type = "", "ppo"
        # Get the config parameters of the Hanabi partners
        TRAIN_PARTNERS, TEST_PARTNERS = get_hanabi_partners(setting, partner_type)
        PARTNERS = [TEST_PARTNERS[args.k % len(TEST_PARTNERS)]] if args.testing else TRAIN_PARTNERS
    num_partners = len(PARTNERS) if PARTNERS is not None else 1

    print("model path: ", model_path)

    net_arch = [
        args.netsz,    # Network dimensions
        args.latentz   # Latent dimensions (z)
    ]
    # NOTE Insertion point for rule-based agents -> look at how to interface this with Umut's work
    # Define the architecture for the partner network
    partner_net_arch = [
        args.netsz,
        args.netsz
    ]
    policy_kwargs = dict(
        activation_fn=nn.ReLU,
        net_arch=[dict(vf=net_arch, pi=net_arch)],
        # NOTE This needs to be changed somehow to incorporate multiple partners
        partner_net_arch=[dict(vf=partner_net_arch, pi=partner_net_arch)],
        num_partners=num_partners,
        baseline=args.baseline,
        nomain=args.nomain,
    )

    def load_model_fn(partners, testing, try_load=True):
        return load_model(
            model_path=model_path,
            policy_class=HanabiPolicy,
            policy_kwargs=policy_kwargs,
            env=env,
            hp=HP,
            partners=partners,
            testing=testing,
            try_load=try_load
        )

    def learn_model_fn(model, timesteps, save, period):
        return learn(
            model,
            model_name=model_name,
            model_path=model_path,
            timesteps=timesteps,
            save=save,
            period=period
        )

    # TRAINING
    if not args.testing:
        print("#Section Training")
        # PARTNERS is passed to load the model. Question: what role does it play?
        model = load_model_fn(partners=PARTNERS, testing=False)
        learn_model_fn(model, timesteps=args.timesteps, save=True, period=2000)

    ts, period = 25600, HP['n_steps_testing']

    # TESTING
    if args.testing and not args.zeroshot:
        if args.baseline:
            adapt_partner_baseline(
                load_model_fn, learn_model_fn, partners=PARTNERS,
                timesteps=ts, period=period, do_optimal=False
            )
        else:
            adapt_partner_modular(
                load_model_fn, learn_model_fn, partners=PARTNERS,
                timesteps=ts, period=period, do_optimal=False
            )
            adapt_partner_scratch(
                load_model_fn, learn_model_fn, partners=PARTNERS,
                timesteps=ts, period=period, do_optimal=False
            )

    if args.testing and args.zeroshot:
        adapt_task(
            load_model_fn, learn_model_fn,
            train_partners=TRAIN_PARTNERS, test_partners=TEST_PARTNERS,
            timesteps=args.timesteps, period=200
        )
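# main(args) expects a parsed-argument namespace, but the parser itself is not shown in
# this file. A minimal sketch of an entry point covering the flags referenced above
# (run, mreg, game config, network sizes, mode switches); the default values here are
# assumptions for illustration, not the settings used in the actual experiments.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--run", type=int, default=0)        # run index used in the model name/path
    parser.add_argument("--mreg", type=float, default=0.0)   # regularisation coefficient stored in HP
    parser.add_argument("--colors", type=int, default=5)     # Hanabi game config
    parser.add_argument("--ranks", type=int, default=5)
    parser.add_argument("--hand_sz", type=int, default=5)
    parser.add_argument("--info", type=int, default=8)
    parser.add_argument("--life", type=int, default=3)
    parser.add_argument("--netsz", type=int, default=64)     # hidden layer size
    parser.add_argument("--latentz", type=int, default=64)   # latent (z) size
    parser.add_argument("--k", type=int, default=0)          # test-partner index
    parser.add_argument("--timesteps", type=int, default=1000000)
    parser.add_argument("--selfplay", action="store_true")
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--nomain", action="store_true")
    parser.add_argument("--testing", action="store_true")
    parser.add_argument("--zeroshot", action="store_true")
    main(parser.parse_args())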