# lr: learning rate, int: interval
# warm-up should be less than one epoch
c.ddpg_update_batch_size = 100
c.ddpg_warmup_steps = 200
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes


def policy_noise(action):
    # add clipped normal noise, then clamp the noisy action back into [-1, 1]
    return t.clamp(
        add_clipped_normal_noise_to_action(action, c.policy_noise_params),
        -1, 1)


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    actor_t = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
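# Illustrative sketch of clipped-normal action noise, the technique policy_noise()
# relies on: sample Gaussian noise, clip the noise itself, and let the caller clamp
# the noisy action back into the valid [-1, 1] range. The helper name and the
# (std, noise_clip) parameters below are hypothetical; this is not the project's
# add_clipped_normal_noise_to_action implementation.
import torch as t


def clipped_normal_noise_sketch(action, std=0.2, noise_clip=0.5):
    noise = t.clamp(t.randn_like(action) * std, -noise_clip, noise_clip)
    return action + noise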
c.root_dir = "/data/AI/tmp/multi_agent/mcarrier/naive_ppo_parallel/" # train configs # lr: learning rate, int: interval c.workers = 5 c.discount = 0.99 c.learning_rate = 3e-4 c.entropy_weight = None c.ppo_update_batch_size = 100 c.ppo_update_times = 50 c.ppo_update_int = 5 # = the number of episodes stored in ppo replay buffer c.model_save_int = c.ppo_update_int * 20 # in episodes c.profile_int = 50 # in episodes if __name__ == "__main__": save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial) prep_args(c, save_env) # save_env.remove_trials_older_than(diff_hour=1) global_board.init(save_env.get_trial_train_log_dir()) writer = global_board.writer logger.info("Directories prepared.") actor = MW( Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device) critic = MW(Critic(observe_dim).to(c.device), c.device, c.device) actor.share_memory() critic.share_memory() logger.info("Networks created") # default replay buffer storage is main cpu mem
load_framework1 = "naive_hddpg"
load_framework2 = "naive_ppo_parallel"
load_trial1 = "2020_05_06_21_50_57"
load_trial2 = "2020_05_06_21_50_57"
test_root_dir = ""


def load_framework(name):
    module = imp.import_module(".magent_" + name)
    return module.c, module.create_models, module.run_agents


if __name__ == "__main__":
    c1, create_models1, run_agents1 = load_framework(load_framework1)
    c2, create_models2, run_agents2 = load_framework(load_framework2)

    save_env1 = SaveEnv(c1.root_dir, restart_use_trial=load_trial1)
    prep_args(c1, save_env1)
    save_env2 = SaveEnv(c2.root_dir, restart_use_trial=load_trial2)
    prep_args(c2, save_env2)

    c1.restart_from_trial = load_trial1
    framework1 = create_models1()
    logger.info("Framework 1 initialized")

    c2.restart_from_trial = load_trial2
    framework2 = create_models2()
    logger.info("Framework 2 initialized")

    operators = [(framework1, run_agents1, load_framework1),
                 (framework2, run_agents2, load_framework2)]
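# Self-contained sketch of the dynamic-import pattern behind load_framework,
# using only the standard importlib API; the module name below is illustrative.
# Note that a leading-dot name such as ".magent_naive_hddpg" is a relative import
# and would additionally require importlib.import_module(name, package=...).
import importlib


def load_by_name_sketch(name):
    # absolute import of "magent_<name>" resolved from sys.path
    return importlib.import_module("magent_" + name)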
              replay_device=c.storage_device,
              replay_size=c.replay_size,
              entropy_weight=c.entropy_weight,
              discount=c.discount,
              update_times=c.ppo_update_times,
              batch_size=c.ppo_update_batch_size,
              learning_rate=c.learning_rate)

    if c.restart_from_trial is not None:
        ppo.load(save_env.get_trial_model_dir())

    return ppo


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    ppo = create_models()
    logger.info("PPO framework initialized")

    # training
    # preparations
    ctx = get_context("spawn")
    pool = Pool(processes=c.workers, context=ctx)
    pool.enable_global_find(True)
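# Plain-multiprocessing analogue of the episode-parallel setup above, shown with
# the standard library only. The real Pool comes from the project's parallel
# utilities, and the worker body here is a placeholder, not the actual rollout
# code that fills the PPO replay buffer.
from multiprocessing import get_context as _get_context


def _episode_worker_sketch(episode_idx):
    # a real worker would roll out one episode with the shared actor and
    # push the collected transitions into the PPO replay buffer
    return episode_idx


if __name__ == "__main__":
    _ctx = _get_context("spawn")
    with _ctx.Pool(processes=5) as _demo_pool:
        _results = _demo_pool.map(_episode_worker_sketch, range(5))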
                q_decrease_rate=c.q_decrease_rate,
                discount=0.99,
                update_rate=0.005,
                batch_size=c.ddpg_update_batch_size,
                learning_rate=0.001,
                replay_size=c.replay_size,
                replay_device=c.storage_device)

    if c.restart_from_trial is not None:
        ddpg.load(save_env.get_trial_model_dir())

    return ddpg


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    ddpg = create_models()
    logger.info("DDPG framework initialized")

    # training
    # preparations
    config = generate_combat_config(c.map_size)
    env = magent.GridWorld(config, map_size=c.map_size)
    env.reset()
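# Hedged illustration of the soft (Polyak) target update implied by
# update_rate=0.005 in the DDPG config above, written in plain torch.
# The helper is hypothetical and not the framework's internal update code.
import torch as t


def soft_update_sketch(target_net, online_net, update_rate=0.005):
    with t.no_grad():
        for tgt, src in zip(target_net.parameters(), online_net.parameters()):
            # target <- (1 - rate) * target + rate * online
            tgt.mul_(1.0 - update_rate).add_(src, alpha=update_rate)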
c.root_dir = "/data/AI/tmp/multi_agent/walker/naive_ppo/" # train configs # lr: learning rate, int: interval c.discount = 0.99 c.learning_rate = 1e-3 c.entropy_weight = None c.ppo_update_batch_size = 100 c.ppo_update_times = 50 c.ppo_update_int = 5 # = the number of episodes stored in ppo replay buffer c.model_save_int = 100 # in episodes c.profile_int = 50 # in episodes if __name__ == "__main__": save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial) prep_args(c, save_env) # save_env.remove_trials_older_than(diff_hour=1) global_board.init(save_env.get_trial_train_log_dir()) writer = global_board.writer logger.info("Directories prepared.") actor = MW(Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device) critic = MW(Critic(observe_dim).to(c.device), c.device, c.device) logger.info("Networks created") # default replay buffer storage is main cpu mem # when stored in main mem, takes about 0.65e-3 sec to move result from gpu to cpu, ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction='sum'),