Example 1
def main():
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cuda"
    policy_fn = lambda device: lambda: FCPolicy(obs_size, act_size, 512, device)
    data_store_size = 12800
    batch_size = 64
    n_envs = 8
    n_cpus = 0
    logger = make_logger("log")
    save_folder = "basic_test_save"

    run_loop(
        logger,
        lambda: DQNLearner(policy_fn("cuda"), 0.001, 0.99, logger, device),
        OccasionalUpdate(10, policy_fn("cpu")),
        lambda: StatelessActor(policy_fn("cuda")()),
        env_fn,
        Saver(save_folder),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_env_ids=n_envs,
        log_frequency=5,
        num_cpus=n_cpus,
        act_steps_until_learn=8000)
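
Each of these examples calls an env_fn that the original scripts define at module scope; it is not shown here. A minimal sketch of such a factory, assuming a Gym environment (the environment id is a placeholder):

import gym

# Hypothetical stand-in for the module-level env_fn the examples assume;
# any zero-argument callable returning a Gym environment fits.
def env_fn():
    return gym.make("CartPole-v1")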
Example 2
def main():
    env = env_fn()
    cpu_count = mp.cpu_count()
    # cpu_count = 0
    num_envs = 8
    num_cpus = 0
    num_targets = 1
    model_features = 512
    data_store_size = 10000
    batch_size = 512
    max_grad_norm = 0.1
    device = "cuda"
    num_actors = 1
    max_learn_steps = 100000

    save_folder = "savedata/"

    def policy_fn_dev(device):
        policy = SACPolicy(env.observation_space, env.action_space, device)
        # load_latest(save_folder, policy)
        return policy

    priority_updater = NoUpdater()
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: SACLearner(policy_fn_dev(device),
                           gamma=0.99,
                           T_max=max_learn_steps,
                           logger=logger,
                           device=device),
        OccasionalUpdate(100, lambda: policy_fn_dev("cpu")),
        lambda: StatelessActor(policy_fn_dev(device)),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(n_cpus),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_cpus=num_cpus,
        num_env_ids=num_envs,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=max_learn_steps,
        # act_steps_until_learn=10000,
        # num_actors=num_actors,
    )
Example 3
def main():
    n_envs = 8
    env_id = "CartPole-v0"
    # def env_fn():
    #     return continuous_actions(gym.make(env_id))
    env = env_fn()
    # print(env.observation_space)
    # obs_size, = env.observation_space.shape
    # act_size = env.action_space.n

    sb3_env = SpaceWrap(env)

    # print(sb3_env.action_space)
    # exit(0)
    n_timesteps = 1000
    save_path = "log"
    eval_freq = 50

    tensorboard_log = ""
    sb3_learner_fn = lambda device: TD3(env=sb3_env,
                                        tensorboard_log=tensorboard_log,
                                        policy=MlpPolicy,
                                        device=device)
    learner_fn = lambda: SB3LearnWrapper(sb3_learner_fn("cuda"))

    policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cuda").policy)
    example_policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cpu").policy)
    # learner = (model)
    learn_rate = lambda x: 0.01
    # policy = SB3Wrapper(model.policy)  # MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu")
    data_store_size = 12800
    batch_size = 512
    logger = make_logger("log")
    run_loop(
        logger,
        learner_fn,  # A2CLearner(policy, 0.001, 0.99, logger, device),
        OccasionalUpdate(10, example_policy_fn()),
        lambda: StatelessActor(policy_fn()),
        env_fn,
        MakeCPUAsyncConstructor(4),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5)
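
For context, SB3LearnWrapper and SB3Wrapper here wrap an ordinary stable-baselines3 model. Used standalone, the equivalent TD3 setup looks roughly like this; TD3 requires a continuous (Box) action space, which is why the commented-out code above wraps CartPole in continuous_actions. Pendulum-v1 is used purely as a stand-in:

import gym
from stable_baselines3 import TD3

env = gym.make("Pendulum-v1")  # TD3 needs a continuous action space
model = TD3("MlpPolicy", env, device="cuda")
model.learn(total_timesteps=1000)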
Example 4
def main():
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cuda"
    policy_fn = lambda: FCPolicy(obs_size, act_size, 64, device)
    data_store_size = 128000
    batch_size = 64
    logger = make_logger("log")
    run_loop(logger,
             lambda: DQNLearner(policy_fn(), 0.001, 0.99, logger, device),
             OccasionalUpdate(10, FCPolicy(obs_size, act_size, 64, "cpu")),
             lambda: StatelessActor(policy_fn()),
             env_fn,
             SyncVectorEnv,
             lambda: TransitionAdder(env.observation_space, env.action_space),
             DensitySampleScheme(data_store_size),
             data_store_size,
             batch_size,
             n_envs=32,
             log_frequency=5)
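
FCPolicy belongs to the examples' own codebase and its definition is not shown. A plausible minimal stand-in, assuming a fully connected torch network whose third constructor argument is the hidden width (matching the calls above); the real class may differ:

import torch.nn as nn

# Hypothetical reconstruction of FCPolicy, for illustration only.
class FCPolicy(nn.Module):
    def __init__(self, obs_size, act_size, hidden_size, device):
        super().__init__()
        self.device = device
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, act_size),  # one Q-value per discrete action
        ).to(device)

    def forward(self, obs):
        return self.net(obs)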
Example 5
def main():
    env = env_fn()  # env is used below but was never created here; assumed, as in the other examples

    save_folder = "savedata/"

    def policy_fn_dev(device, is_learner=False):
        device = torch.device(device)
        policy = Agent(device, args, env, logger, priority_updater,
                       is_learner=is_learner)
        load_latest(save_folder, policy)
        return policy
    data_store_size = 500000
    batch_size = 256
    args.batch_size = batch_size
    n_envs = 32
    n_cpus = 32
    priority_updater = PriorityUpdater()
    logger = make_logger("log")
    print("cpu create")

    print("cpu finish create")
    run_loop(
        logger,
        lambda: policy_fn_dev("cuda:0",is_learner=True),#DDPGLearner(policy_fn, reward_normalizer_fn, 0.001, 0.99, 0.1, logger, priority_updater, device),
        OccasionalUpdate(100, lambda: policy_fn_dev("cpu")),
        lambda: StatelessActor(policy_fn_dev("cuda:0")),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(n_cpus),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),  # alpha=0.5, beta_fn=lambda x: 0.
        data_store_size,
        batch_size,
        act_steps_until_learn=200000,
        num_env_ids=n_envs,
        num_cpus=n_cpus,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=10000000,
    )
    print("loopterm")
Example 6
def main():
    env = env_fn()
    print(env.observation_space)
    obs_size, = env.observation_space.shape
    act_size = env.action_space.n
    device = "cpu"
    policy = FCPolicy(obs_size, act_size, 64, device)
    data_store_size = 12800
    batch_size = 16
    logger = make_logger("log")
    run_loop(
        logger,
        DQNLearner(policy, 0.001, 0.99, logger, device),
        OccasionalUpdate(10, policy),
        StatelessActor(policy),
        env_fn,
        ConcatVecEnv,
        lambda: TransitionAdder(env.observation_space, env.action_space),
        DensitySampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5
    )
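
Examples 4 and 6 pass a vector-environment constructor (SyncVectorEnv, ConcatVecEnv) in the slot where the later examples pass a Saver, so the run_loop signature evidently differs across these snippets. For reference, Gym's own SyncVectorEnv takes a list of environment factories and steps the copies in lock step:

from gym.vector import SyncVectorEnv

# Reusing the env_fn sketch above; 16 copies stepped synchronously.
venv = SyncVectorEnv([env_fn for _ in range(16)])
obs = venv.reset()  # newer Gym versions return (obs, info)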
Example 7
def main():
    env = env_fn()
    cpu_count = mp.cpu_count()
    # cpu_count = 0
    num_envs = 8
    num_cpus = 4
    num_targets = 1
    model_features = 512
    data_store_size = 500000
    batch_size = 512
    max_grad_norm = 0.1
    num_actions = env.action_space.n
    device = "cuda"
    num_actors = 1
    max_learn_steps = 100000

    # venv = MakeCPUAsyncConstructor(cpu_count)([env_fn]*num_envs, env.observation_space, env.action_space)
    # venv.reset()
    def model_fn():
        return FlatModel(env.observation_space.shape[0])

    save_folder = "savedata/"

    def policy_fn_dev(device):
        policy = DiversityPolicy(model_fn, model_features, num_actions,
                                 num_targets, obs_preproc, device)
        load_latest(save_folder, policy)
        return policy

    policy_fn = lambda: policy_fn_dev(device)
    priority_updater = NoUpdater()
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: DiversityLearner(discount_factor=0.99,
                                 obs_preproc=obs_preproc,
                                 model_fn=model_fn,
                                 max_learn_steps=max_learn_steps,
                                 model_features=model_features,
                                 logger=logger,
                                 device=device,
                                 num_targets=num_targets,
                                 num_actions=num_actions),
        OccasionalUpdate(10, lambda: policy_fn_dev("cpu")),
        lambda: TargetUpdaterActor(policy_fn(),
                                   num_envs // num_actors,
                                   num_targets,
                                   target_staggering=1.314),
        env_fn,
        Saver(save_folder),
        # MakeCPUAsyncConstructor(num_cpus),
        lambda: TargetTransitionAdder(env.observation_space, env.action_space,
                                      num_targets),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        num_cpus=num_cpus,
        num_env_ids=num_envs,
        priority_updater=priority_updater,
        log_frequency=5,
        max_learn_steps=max_learn_steps,
        act_steps_until_learn=10000,
        # num_actors=num_actors,
    )