def create_models():
    config = generate_combat_config(c.map_size)
    view_shape, feature_shape, action_dim = get_agent_io_shapes(
        config, c.map_size)[0]
    actor = MW(Actor(view_shape, feature_shape, action_dim, c.conv).to(c.device),
               c.device, c.device)
    critic = MW(Critic(view_shape, feature_shape, c.conv).to(c.device),
                c.device, c.device)
    actor.share_memory()
    critic.share_memory()

    ppo = PPO(actor, critic,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.storage_device,
              replay_size=c.replay_size,
              entropy_weight=c.entropy_weight,
              discount=c.discount,
              update_times=c.ppo_update_times,
              batch_size=c.ppo_update_batch_size,
              learning_rate=c.learning_rate)

    if c.restart_from_trial is not None:
        ppo.load(save_env.get_trial_model_dir())

    return ppo
def create_models():
    config = generate_combat_config(c.map_size)
    view_shape, feature_shape, action_dim = get_agent_io_shapes(
        config, c.map_size)[0]
    actor = MW(
        Actor(view_shape, feature_shape, action_dim, c.conv).to(c.device),
        c.device, c.device)
    actor_t = MW(
        Actor(view_shape, feature_shape, action_dim, c.conv).to(c.device),
        c.device, c.device)
    critic = MW(
        Critic(view_shape, feature_shape, c.conv).to(c.device),
        c.device, c.device)
    critic_t = MW(
        Critic(view_shape, feature_shape, c.conv).to(c.device),
        c.device, c.device)

    ddpg = HDDPG(actor, actor_t, critic, critic_t,
                 t.optim.Adam,
                 nn.MSELoss(reduction='sum'),
                 q_increase_rate=c.q_increase_rate,
                 q_decrease_rate=c.q_decrease_rate,
                 discount=0.99,
                 update_rate=0.005,
                 batch_size=c.ddpg_update_batch_size,
                 learning_rate=0.001,
                 replay_size=c.replay_size,
                 replay_device=c.storage_device)

    if c.restart_from_trial is not None:
        ddpg.load(save_env.get_trial_model_dir())

    return ddpg
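
# --- Hedged usage sketch (not part of the original scripts) -------------------
# Both create_models() variants above return a single framework object that a
# trainer loop drives. The names `run_one_episode`, `store_episode`, `update`,
# `save` and the config field `c.max_episodes` below are hypothetical
# placeholders, not confirmed by these files; the sketch only shows where the
# factory's return value would plug in.
def _example_training_loop():
    framework = create_models()
    for episode in range(c.max_episodes):            # hypothetical config field
        transitions = run_one_episode(framework)     # hypothetical helper
        framework.store_episode(transitions)         # hypothetical API
        framework.update()                           # hypothetical API
        if episode % c.model_save_int == 0:
            framework.save(save_env.get_trial_model_dir())  # hypothetical API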
def policy_noise(action):
    return t.clamp(
        add_clipped_normal_noise_to_action(action, c.policy_noise_params),
        -1, 1)


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer

    logger.info("Directories prepared.")

    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    actor_t = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic2 = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic2_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    logger.info("Networks created")

    # default replay buffer storage is main cpu mem
    # when stored in main mem, takes about 0.65e-3 sec to move result from gpu to cpu,
c.ddpg_update_batch_size = 100
c.ddpg_warmup_steps = 2000
c.ddpg_average_target_int = 200
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer

    logger.info("Directories prepared.")

    actor = MW(Actor(observe_dim, action_dim, 1))
    actor_t = MW(Actor(observe_dim, action_dim, 1))
    critic = MW(Critic(c.agent_num, observe_dim, action_dim))
    critic_t = MW(Critic(c.agent_num, observe_dim, action_dim))
    logger.info("Networks created")

    ddpg = MADDPG(c.agent_num,
                  actor, actor_t, critic, critic_t,
                  t.optim.Adam,
                  nn.MSELoss(reduction='sum'),
                  sub_policy_num=c.sub_policy_num,
                  discount=0.99,
                  update_rate=0.005,
c.ppo_update_batch_size = 100
c.ppo_update_times = 4
c.ppo_update_int = 6  # = the number of episodes stored in ppo replay buffer
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer

    logger.info("Directories prepared.")

    actor = MW(Actor(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic = MW(Critic(observe_dim).to(c.device), c.device, c.device)
    logger.info("Networks created")

    # default replay buffer storage is main cpu mem
    # when stored in main mem, takes about 0.65e-3 sec to move result from gpu to cpu,
    ppo = PPO(actor, critic,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.device,
              replay_size=c.replay_size,
              entropy_weight=c.entropy_weight,
              discount=c.discount,
              update_times=c.ppo_update_times,
              learning_rate=c.learning_rate)
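
# --- Hedged micro-benchmark sketch (not part of the original scripts) ---------
# The comment above quotes roughly 0.65e-3 sec to move a result from GPU to CPU,
# which motivates the choice of replay_device. A minimal way to check that
# figure on your own hardware, using only plain PyTorch calls; the number
# depends heavily on tensor size and hardware, so treat it as an illustration.
def _time_gpu_to_cpu_transfer(shape=(256, 256), repeats=100):
    import time
    if not t.cuda.is_available():
        return None
    x = t.randn(shape, device="cuda")
    t.cuda.synchronize()                  # make sure the tensor is materialized
    start = time.perf_counter()
    for _ in range(repeats):
        _ = x.to("cpu")                   # device-to-host copy under test
    t.cuda.synchronize()
    return (time.perf_counter() - start) / repeats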