Example #1
# lr: learning rate, int: interval
# the warm-up period should be less than one epoch
c.ddpg_update_batch_size = 100
c.ddpg_warmup_steps = 200
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes


def policy_noise(action):
    # add clipped normal noise to the action, then clamp back into [-1, 1]
    return t.clamp(
        add_clipped_normal_noise_to_action(action, c.policy_noise_params),
        -1, 1)


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    # online networks and their target copies (actor_t / critic_t)
    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    actor_t = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
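
The add_clipped_normal_noise_to_action helper is not shown in this snippet; the sketch below is only a guess at its shape, assuming TD3-style target policy smoothing where c.policy_noise_params packs a noise scale and a clip range (both names are illustrative).

import torch as t

def add_clipped_normal_noise_to_action(action, params):
    # assumed layout of params: (noise standard deviation, clip range)
    sigma, noise_clip = params
    noise = t.clamp(t.randn_like(action) * sigma, -noise_clip, noise_clip)
    return action + noise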
Example #2
c.root_dir = "/data/AI/tmp/multi_agent/mcarrier/naive_ppo_parallel/"

# train configs
# lr: learning rate, int: interval
c.workers = 5
c.discount = 0.99
c.learning_rate = 3e-4
c.entropy_weight = None
c.ppo_update_batch_size = 100
c.ppo_update_times = 50
c.ppo_update_int = 5  # number of episodes stored in the PPO replay buffer before each update
c.model_save_int = c.ppo_update_int * 20  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(Critic(observe_dim).to(c.device), c.device, c.device)
    actor.share_memory()
    critic.share_memory()
    logger.info("Networks created")

    # default replay buffer storage is main cpu mem
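
share_memory() above places the networks' parameter tensors in shared memory so that processes started via torch.multiprocessing can read the same weights without copying the modules. A minimal sketch of that pattern, where collect_episode is a purely hypothetical rollout helper:

from torch.multiprocessing import get_context

def collect_episode(worker_id, shared_actor):
    # roll out one episode with the shared actor and return the transitions
    # (rollout details omitted in this sketch)
    pass

ctx = get_context("spawn")
procs = [ctx.Process(target=collect_episode, args=(i, actor))
         for i in range(c.workers)]
for p in procs:
    p.start()
for p in procs:
    p.join()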
Example #3
load_framework1 = "naive_hddpg"
load_framework2 = "naive_ppo_parallel"
load_trial1 = "2020_05_06_21_50_57"
load_trial2 = "2020_05_06_21_50_57"
test_root_dir = ""


def load_framework(name):
    # dynamically import the framework module ("magent_" + name) relative to
    # the current package and return its config and entry points
    module = imp.import_module(".magent_" + name)
    return module.c, module.create_models, module.run_agents


if __name__ == "__main__":
    c1, create_models1, run_agents1 = load_framework(load_framework1)
    c2, create_models2, run_agents2 = load_framework(load_framework2)
    save_env1 = SaveEnv(c1.root_dir, restart_use_trial=load_trial1)
    prep_args(c1, save_env1)
    save_env2 = SaveEnv(c2.root_dir, restart_use_trial=load_trial2)
    prep_args(c2, save_env2)

    c1.restart_from_trial = load_trial1
    framework1 = create_models1()
    logger.info("Framework 1 initialized")

    c2.restart_from_trial = load_trial2
    framework2 = create_models2()
    logger.info("Framework 2 initialized")

    operators = [(framework1, run_agents1, load_framework1),
                 (framework2, run_agents2, load_framework2)]
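
The snippet stops at the operators list; a plausible continuation iterates it so each framework drives its own agents in the shared test environment. The run_agents signature used below is an assumption, not taken from the source.

    for framework, run_agents, name in operators:
        logger.info("Running agents for framework: " + name)
        run_agents(framework)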
Example #4
              replay_device=c.storage_device,
              replay_size=c.replay_size,
              entropy_weight=c.entropy_weight,
              discount=c.discount,
              update_times=c.ppo_update_times,
              batch_size=c.ppo_update_batch_size,
              learning_rate=c.learning_rate)

    if c.restart_from_trial is not None:
        ppo.load(save_env.get_trial_model_dir())

    return ppo


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    ppo = create_models()
    logger.info("PPO framework initialized")

    # training
    # preparations
    ctx = get_context("spawn")
    pool = Pool(processes=c.workers, context=ctx)
    pool.enable_global_find(True)
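
pool.enable_global_find(True) suggests a custom process pool that lets workers resolve globals such as the shared ppo object. Assuming the pool otherwise behaves like multiprocessing.Pool, a rollout step could look like the sketch below, where sample_episode is a hypothetical function.

def sample_episode(rank):
    # run one episode with the shared PPO actor, store the transitions in
    # the PPO replay buffer, and return the episode reward (details omitted)
    return 0.0

episode_rewards = pool.map(sample_episode, range(c.workers))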
Example #5
                 q_decrease_rate=c.q_decrease_rate,
                 discount=0.99,
                 update_rate=0.005,
                 batch_size=c.ddpg_update_batch_size,
                 learning_rate=0.001,
                 replay_size=c.replay_size,
                 replay_device=c.storage_device)

    if c.restart_from_trial is not None:
        ddpg.load(save_env.get_trial_model_dir())

    return ddpg


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    ddpg = create_models()
    logger.info("DDPG framework initialized")

    # training
    # preparations
    config = generate_combat_config(c.map_size)
    env = magent.GridWorld(config, map_size=c.map_size)
    env.reset()
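
update_rate=0.005 above is the usual DDPG soft (Polyak) target-update coefficient. A stand-alone sketch of that operation over plain torch modules, independent of whatever method name the framework uses internally:

import torch as t

def soft_update(target_net, online_net, update_rate=0.005):
    # target <- (1 - update_rate) * target + update_rate * online
    with t.no_grad():
        for tp, p in zip(target_net.parameters(), online_net.parameters()):
            tp.mul_(1.0 - update_rate).add_(p, alpha=update_rate)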
Example #6
c.root_dir = "/data/AI/tmp/multi_agent/walker/naive_ppo/"

# train configs
# lr: learning rate, int: interval
c.discount = 0.99
c.learning_rate = 1e-3
c.entropy_weight = None
c.ppo_update_batch_size = 100
c.ppo_update_times = 50
c.ppo_update_int = 5  # number of episodes stored in the PPO replay buffer before each update
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(Critic(observe_dim).to(c.device), c.device, c.device)
    logger.info("Networks created")

    # default replay buffer storage is main cpu mem
    # when stored in main mem, takes about 0.65e-3 sec to move result from gpu to cpu,
    ppo = PPO(actor, critic,
              t.optim.Adam, nn.MSELoss(reduction='sum'),
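
The snippet is cut off inside the PPO constructor; the nn.MSELoss(reduction='sum') argument is the criterion for the critic's value loss. A minimal illustration of what that criterion computes on a batch (all tensor names below are made up):

import torch as t
import torch.nn as nn

criterion = nn.MSELoss(reduction='sum')
predicted_values = t.randn(32, 1)   # critic output for a batch of states
returns = t.randn(32, 1)            # discounted return targets
value_loss = criterion(predicted_values, returns)  # summed squared error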