def main():
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    if not USE_LOADED_MODEL:
        model = ACKTR('MlpPolicy', env, verbose=1)

        # Multiprocessed RL Training
        start_time = time.time()
        model.learn(total_timesteps=n_timesteps, log_interval=10)
        total_time_multi = time.time() - start_time

        model.save("cartpole_v1_acktr")

    loaded_model = ACKTR.load("cartpole_v1_acktr")
    loaded_model.set_env(env)

    # Single Process RL Training
    single_process_model = ACKTR('MlpPolicy', env_id, verbose=1)
    start_time = time.time()
    single_process_model.learn(n_timesteps)
    total_time_single = time.time() - start_time

    print("Single-process: {0}s, Multi-process: {1}s".format(
        total_time_single, total_time_multi))

    # create separate clean environment for evaluation
    eval_env = gym.make(env_id)
    mean_reward, std_reward = evaluate_policy(loaded_model,
                                              eval_env,
                                              n_eval_episodes=10)
    print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')
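Note: several examples in this listing call a make_env helper that is not shown. A minimal sketch of the usual Stable Baselines factory function (per-process seeding, Gym env id) is:

import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    """Return a thunk that creates and seeds one environment instance."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init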
Example #2
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
Example #3
def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
Example #4
def acktr(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = ACKTR(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with ACKTR.")
    model.learn(total_timesteps=timesteps)

    env.close()
Example #5
def optimize_agent(trial):
    agent = PPO2
    policy = MlpLstmPolicy
    train_env, test_env = optimize_envs(trial)

    if agent == ACKTR:
        params = optimize_acktr(trial)
        model = ACKTR(policy, train_env, verbose=1,
                      tensorboard_log="./tensorboard", **params)
    elif agent == PPO2:
        params = optimize_ppo2(trial)
        model = PPO2(policy, train_env, verbose=1, nminibatches=1,
                     tensorboard_log="./tensorboard", **params)

    model.test_env = test_env
    model.trial = trial

    try:
        model.learn(n_timesteps, callback=learn_callback)
        model.env.close()
        test_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        model.env.close()
        model.test_env.close()
        raise

    is_pruned = False
    cost = np.inf

    if hasattr(model, 'is_pruned'):
        is_pruned = model.is_pruned  # pylint: disable=no-member
        cost = -1 * model.last_mean_test_reward  # pylint: disable=no-member

    del model.env, model.test_env
    del model

    if is_pruned:
        raise optuna.structs.TrialPruned()

    return cost
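The optimize_acktr and learn_callback helpers used above are not shown. A plausible sketch of the Optuna sampling function (parameter names and ranges here are illustrative assumptions, not the original search space) could be:

def optimize_acktr(trial):
    """Sample ACKTR hyperparameters for one Optuna trial (illustrative ranges)."""
    return {
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'n_steps': trial.suggest_categorical('n_steps', [16, 32, 64, 128]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 1.0),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
    }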
Example #6
def optimize_agent(trial):
    """ Train the model and optimise.
        Optuna minimises the objective by default, so the reward
        would need to be negated here (or the study created with
        direction='maximize').
    """
    model_params = optimize_acktr(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = ACKTR("MlpPolicy", env, nprocs=1, verbose=0, **model_params)
    print("DOING LEARING acer")
    original_env.force_progression = False
    model.learn(int(2e4), seed=seed)
    print("DONE LEARING acer")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
Example #7
def train_acktr(seed):
    """
    test ACKTR on the uav_env(cartesian,discrete) 
    """
    """
    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25, 
    vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, 
    lr_schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True, 
    async_eigen_decomp=False)
    """
    algo = 'ACKTR'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
                  ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
                  max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
                  tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo),
                  _init_setup_model=True)
    # , async_eigen_decomp=False)

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = ACKTR.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
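set_up_env is a project-specific helper that is not shown. Judging from the Monitor file moved at the end of the function, a rough sketch (the environment id is a placeholder assumption) would be:

import os
import gym
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv

def set_up_env(seed):
    """Hypothetical reconstruction: build, seed, and wrap the UAV env for Stable Baselines."""
    os.makedirs('/tmp/gym/', exist_ok=True)
    env = gym.make('uav-v0')  # placeholder id; the real env id is not shown
    env.seed(seed)
    env = Monitor(env, '/tmp/gym/', allow_early_resets=True)
    return DummyVecEnv([lambda: env])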
Example #8
def stable_baseline_test(env_origin):
    env = make_vec_env(lambda: env_origin, n_envs=1)
    model = ACKTR('CnnPolicy', env_origin, verbose=1)
    model.learn(total_timesteps=2000000)
    print("Stable_baseline evaluation starts.....\n")
    #NOTE:evaluate_policy needs vec_env
    reward_mean, reward_std = evaluate_policy(model,
                                              env,
                                              n_eval_episodes=20,
                                              deterministic=False)

    print("mean reward:" + str(reward_mean) + '\n')
    print("reward std:" + str(reward_std) + '\n')

    print("custom evaluation begin\n")

    env = env_origin
    obs = env.reset()
    reward_list_total = []
    epilen_list = []
    reward_list = []
    last_end = 0
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward_list.append(rewards)
        if dones:
            obs = env.reset()
            epilen_list.append(i - last_end)
            last_end = i
            reward_list_total.append(np.sum(reward_list))
            reward_list = []
            if i > 900:
                break
    print("mean reward:{}\n".format(np.mean(reward_list_total)))
    print("mean epilen:{}\n".format(np.mean(epilen_list)))
Example #9
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)
    scenario = str(
        f'{inputfile_s}_t{test}_lr{LR_s}_gamma{gamma_s}_batch{batch_size}')
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env, log_path=scenario, n_eval_episodes=5)
    ])

    model = ACKTR(MlpPolicy,
                  env,
                  gamma=gamma,
                  n_steps=batch_size,
                  learning_rate=LR,
                  verbose=1)  #, tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)

    filename = './%s/evaluations.npz' % scenario

    data = np.load(filename)
    results = data['results']
    # y = np.average(results, axis=1)  # alternative: average over eval episodes
    y = results[:, 0]
    timesteps = data['timesteps']
    plt.plot(timesteps, y)

    plt.xlabel('Timesteps')
    plt.ylabel('Score')
    #plt.show()

    savepath = './%s/fig_%s' % (scenario, scenario)
    plt.savefig(savepath)
Example #10
    n_steps += 1
    # Returning False will stop training early
    return True


env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
if os.path.isfile(model_file):
    model = ACKTR.load(model_file, env=env)
else:
    model = ACKTR(
        MlpLnLstmPolicy,
        env,
        tensorboard_log=f"./test{base_test_file}/",
        verbose=0
    )  # add tensorboard_log="./test/" and run tensorboard --logdir /Users/constantin/Documents/bn/rl/test/PPO2_1
model.learn(total_timesteps=10**5, callback=callback)

# def evaluate(model, num_steps=1000):
#     obs = env.reset()
#     for i in range(num_steps):
#         # _states are only useful when using LSTM policies
#         action, _states = model.predict(obs)
#
#         obs, reward, done, info = env.step(action)
#         env.render()
#
#
# model = PPO2.load("/home/constantin/Desktop/projects/disertation/rl_logs_1_1-20200120T201830Z-001/rl_logs_1_1/1_best_model399.pkl")
# evaluate(model, 30)
Example #11
        if (n_steps + 1) % 100000 == 0:
            print("Saving checkpoint model")
            _locals['self'].save(model_dir + 'model_{}_steps.pkl'.format(n_steps + 1))

        n_steps += 1
        return True


    print('Starting Training')
    """
    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25,
     vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
      lr_schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True,
       async_eigen_decomp=False, policy_kwargs=None, full_tensorboard_log=False)
    """

    model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
                  ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
                  max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
                  tensorboard_log=None, _init_setup_model=True)

    model.learn(total_timesteps=num_timesteps, callback=custom_callback, seed=seed,
                log_interval=100)

    print('Starting evaluation')
    env = setup_env_cart_discrete(seed, log_dir)
    model.set_env(env)

    get_trajectories(model, trajectory_dir, n_trajectories=100)
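The callback fragments above use the old-style stable-baselines callback signature: a plain function of (_locals, _globals) that returns True to continue training. A minimal reconstruction of the checkpointing callback they belong to is shown below (model_dir is assumed to be defined elsewhere in the original script):

n_steps = 0

def custom_callback(_locals, _globals):
    """Save a checkpoint every 100000 calls; returning False would stop training."""
    global n_steps
    if (n_steps + 1) % 100000 == 0:
        print("Saving checkpoint model")
        # model_dir is assumed to be defined elsewhere in the original script
        _locals['self'].save(model_dir + 'model_{}_steps.pkl'.format(n_steps + 1))
    n_steps += 1
    return True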
Example #12
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])

model = ACKTR(get_policy(policy),
              env,
              verbose=0,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='ACKTR_A2C' + model_tag)

model.save(model_folder + "ACKTR_A2C" + model_tag)
del model
model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

done = False
states = None
obs = env.reset()

while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
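get_policy is not shown in these snippets. A plausible sketch that maps the command-line argument to a Stable Baselines policy class (the exact mapping is an assumption) is:

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy

def get_policy(name):
    """Map a CLI policy name to a policy class, defaulting to MlpPolicy (assumed mapping)."""
    return {
        'MlpLstmPolicy': MlpLstmPolicy,
        'MlpLnLstmPolicy': MlpLnLstmPolicy,
    }.get(name, MlpPolicy)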
Example #13
import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=0.9,
                                                 verbose=1)
eval_callback = EvalCallback(env,
                             callback_on_new_best=callback_on_best,
                             verbose=1)

model = ACKTR('MlpPolicy', env, verbose=1)
model.learn(int(1e10), callback=eval_callback)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

model.save('models/best')

env.close()
Example #14
    # This function will only work for a single Environment
    env = model.get_env()
    all_episode_rewards = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            # here, action, rewards and dones are arrays
            # because we are using vectorized env
            obs, reward, done, info = env.step(action)
            episode_rewards.append(reward)

        print("Episode", i, "Reward:", sum(episode_rewards))
        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    min_episode_reward = np.min(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Min reward:",
          min_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward


# Test the trained agent
evaluate(model, num_episodes=100)
model.learn(500)
evaluate(model, num_episodes=100)
Example #15
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: NegativeRewardEnv(map_name='map1')])

model = ACKTR(get_policy(policy),
              env,
              verbose=0,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=2500000, tb_log_name='ACKTR_A2C_map1' + model_tag)

model.save(model_folder + "ACKTR_A2C_map1" + model_tag)
del model
model = ACKTR.load(model_folder + "ACKTR_A2C_map1" + model_tag)

done = False
states = None
obs = env.reset()

while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
Example #16
        # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value
        model.learn(total_timesteps=1000000,
                    reset_num_timesteps=False,
                    callback=callback)
        model.save(log_dir + 'model_PPO_' + str(id + 1))

    if args.algo == "acktr":
        id = balboa.utils.tensorboard_latest_directory_number(
            log_dir, 'ACKTR_')
        print('Using acktr')
        if args.load_id is None:
            # tensorboard_log=log_dir
            model = ACKTR("MlpPolicy",
                          env,
                          policy_kwargs=policy_kwargs,
                          ent_coef=0.0,
                          verbose=1)
            # verbose=1, n_steps=48, learning_rate=0.1, lr_schedule='constant',
        else:
            print("Loading model: " + str(args.load_id))
            model = ACKTR.load(log_dir + 'ACKTR_' + str(args.load_id) + ".zip",
                               env=env)
        model.tensorboard_log = log_dir
        # model.learning_rate = stable_baselines.common.schedules.LinearSchedule(1.0, 0.06, initial_p=0.06).value
        # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value

        model.learn(total_timesteps=3000000,
                    reset_num_timesteps=False,
                    callback=callback)
        print("Saving to: " + log_dir + 'ACKTR_' + str(id + 1))
        model.save(log_dir + 'model_ACKTR_' + str(id + 1))
Example #17
                  gamma=config['gamma'],
                  policy_kwargs=config['policy_kwargs'],
                  verbose=1,
                  tensorboard_log=save_path)

elif config['algorithm'] == 'PPO2':
    env = make_vec_env(lambda: env, n_envs=1)
    model = PPO2(config['policy_network'],
                 env,
                 learning_rate=config['learning_rate'],
                 gamma=config['gamma'],
                 policy_kwargs=config['policy_kwargs'],
                 verbose=1,
                 tensorboard_log=save_path)

elif config['algorithm'] == 'DQN':
    model = DQN(
        config['policy_network'],
        env,
        learning_rate=config['learning_rate'],
        buffer_size=config['buffer_size'],
        target_network_update_freq=64,
        gamma=config['gamma'],  # policy_kwargs = config['policy_kwargs'],
        verbose=1,
        tensorboard_log=save_path)

model.learn(config['total_steps'], callback=callback)
model.save(os.path.join(save_path, 'model'))

env.close()
Example #18
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: ActionMaskEnv()])

model = ACKTR(get_policy(policy),
              env,
              verbose=0,
              gae_lambda=0.95,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=2500000, tb_log_name='ACKTR_PPO2' + model_tag)

model.save(model_folder + "ACKTR_PPO2" + model_tag)
del model
model = ACKTR.load(model_folder + "ACKTR_PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()

while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
    obs, _, done, infos = env.step(action)
    env.render()
    action_masks.clear()
Example #19
def train(environment, algorithm, timesteps):
    from envs import cpa, mountain_car

    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.bench import Monitor
    from stable_baselines import PPO2, ACKTR, DQN, A2C

    now = datetime.now()
    current_time = now.strftime("%Y-%m-%d-%H-%M-%S")

    training_info_dir = "training_info" + os.path.sep
    current_training_info = "{}-{}-{}".format(current_time, algorithm, environment)
    current_training_info_dir = training_info_dir + current_training_info + os.path.sep

    model_file_path = current_training_info_dir + "model"
    log_file_path = current_training_info_dir + "monitor.csv"

    tensorboard_dir = training_info_dir + TENSORBOARD_DIR_NAME + os.path.sep

    dirs_to_create = [model_file_path, tensorboard_dir]

    for directory in dirs_to_create:
        create_dir(directory)

    env = None

    if environment == 'cpa_sparse':
        env = cpa.CPAEnvSparse()
    elif environment == 'cpa_dense':
        env = cpa.CPAEnvDense()
    elif environment == 'mc_sparse':
        env = mountain_car.MountainCarSparseEnv()
    elif environment == 'mc_dense':
        env = mountain_car.MountainCarDenseEnv()
    else:
        raise Exception("Environment '{}' is unknown.".format(environment))

    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    env = Monitor(env, filename=log_file_path, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = None

    if algorithm == 'acktr':
        model = ACKTR('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'a2c':
        model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'dqn':
        model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    else:
        raise Exception("Algorithm '{}' is unknown.".format(algorithm))

    # Train the agent
    model.learn(total_timesteps=timesteps, tb_log_name=current_training_info)

    model.save(model_file_path)

    print("Finished training model: {}. Saved training info in: {}".format(model, current_training_info_dir))
Example #20
def test_action_mask_learn_acktr(vec_env, policy, env_class):
    env = vec_env([env_class]*2)

    model = ACKTR(policy, env, verbose=0)
    model.learn(total_timesteps=500)
    env.close()
Example #21
class ACKTR_Agent:
    def __init__(self, params: Params):
        self.params: Params = params
        policy_name = self.params.agent_config['policy']
        self.policy = eval(policy_name)

    def create_model(self, n_envs=1):
        """ Create env and agent model """
        env_cls = SprEnv
        self.env = make_vec_env(env_cls,
                                n_envs=n_envs,
                                env_kwargs={"params": self.params},
                                seed=self.params.seed)
        self.model = ACKTR(
            self.policy,
            self.env,
            gamma=self.params.agent_config['gamma'],
            n_steps=self.params.agent_config['n_steps'],
            ent_coef=self.params.agent_config['ent_coef'],
            vf_coef=self.params.agent_config['vf_coef'],
            vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
            max_grad_norm=self.params.agent_config['max_grad_norm'],
            learning_rate=self.params.agent_config['learning_rate'],
            gae_lambda=self.params.agent_config['gae_lambda'],
            lr_schedule=self.params.agent_config['lr_schedule'],
            kfac_clip=self.params.agent_config['kfac_clip'],
            kfac_update=self.params.agent_config['kfac_update'],
            async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
            verbose=self.params.agent_config['verbose'],
            tensorboard_log="./tb/acktr/",
            seed=self.params.seed,
            policy_kwargs={"params": self.params})

    def train(self):
        with ProgressBarManager(self.params.training_duration) as callback:
            self.model.learn(total_timesteps=self.params.training_duration,
                             tb_log_name=self.params.tb_log_name,
                             callback=callback)

    def test(self):
        self.params.test_mode = True
        obs = self.env.reset()
        self.setup_writer()
        episode = 1
        step = 0
        episode_reward = [0.0]
        done = False
        # Test for 1 episode
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward[episode - 1] += reward[0]
            if info[0]['sim_time'] >= self.params.testing_duration:
                done = True
                self.write_reward(episode, episode_reward[episode - 1])
                episode += 1
            sys.stdout.write(
                "\rTesting:" +
                f"Current Simulator Time: {info[0]['sim_time']}. Testing duration: {self.params.testing_duration}"
            )
            sys.stdout.flush()
            step += 1
        print("")

    def save_model(self):
        """ Save the model to a zip archive """
        self.model.save(self.params.model_path)

    def load_model(self, path=None):
        """ Load the model from a zip archive """
        if path is not None:
            self.model = ACKTR.load(path)
        else:
            self.model = ACKTR.load(self.params.model_path)
            # Copy the model to the new directory
            self.model.save(self.params.model_path)

    def setup_writer(self):
        episode_reward_filename = f"{self.params.result_dir}/episode_reward.csv"
        episode_reward_header = ['episode', 'reward']
        self.episode_reward_stream = open(episode_reward_filename,
                                          'a+',
                                          newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)

    def write_reward(self, episode, reward):
        self.episode_reward_writer.writerow([episode, reward])
Example #22
            n_episodes += len(episode_rewards[i])

        # Compute mean reward
        mean_reward = round(np.mean(mean_rewards), 1)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward

    # Random Agent, before training
    mean_reward_before_train = evaluate(model, num_steps=1000)

    n_timesteps = 25000

    # Multiprocessed RL Training
    start_time = time.time()
    model.learn(n_timesteps)
    total_time_multi = time.time() - start_time

    print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
        total_time_multi, n_timesteps / total_time_multi))

    # Evaluate the trained agent
    mean_reward = evaluate(model, num_steps=10000)

    # Single Process RL Training
    single_process_model = ACKTR(MlpPolicy,
                                 DummyVecEnv([lambda: gym.make(env_id)]),
                                 verbose=0)

    start_time = time.time()
    single_process_model.learn(n_timesteps)
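The step-based evaluate helper called in this fragment is only partially visible above. A simplified, self-contained sketch in the spirit of the Stable Baselines multiprocessing tutorial (here taking a single, non-vectorized Gym env explicitly, unlike the original which reads it from the model) is:

import numpy as np

def evaluate(model, env, num_steps=1000):
    """Run the policy for num_steps and report the mean per-episode reward (a sketch)."""
    obs = env.reset()
    episode_rewards = [0.0]
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = round(float(np.mean(episode_rewards)), 1)
    print("Mean reward:", mean_reward, "Num episodes:", len(episode_rewards))
    return mean_reward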
Example #23
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #24
    #env = CustomEnv(3, 6, "tcp://*:5556")
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # Create log dir
    log_dir = "Logs/Custom_env/"
    os.makedirs(log_dir, exist_ok=True)
    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=500,
                                                log_dir=log_dir)

    #env = Monitor(env, log_dir)

    model = ACKTR(MlpPolicy, env, verbose=2)
    #model.load("DQN_agent")
    model.learn(total_timesteps=20000, callback=callback)
    model.save("temp_agent")

    a = input("Training completed")

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        probs = model.action_probability(obs)
        obs, rewards, dones, info = env.step(action)
        print("Observation:", obs, rewards, probs)

    results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS,
                                 "Lane Manager")
    plt.show()
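SaveOnBestTrainingRewardCallback is not a built-in stable-baselines class; it comes from the library's callback examples. A sketch along those lines is shown below; note it relies on the env being wrapped in a Monitor writing to log_dir, which the snippet above has commented out:

import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Check the Monitor logs every check_freq steps and save the best model so far."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Read episode rewards from the Monitor files in log_dir
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True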
Example #25
        episode_rewards = []
        done = False
        obs = env.reset()
        env.render()
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            # here, action, rewards and dones are arrays
            # because we are using vectorized env
            obs, reward, done, info = env.step(action)
            print(reward)
            env.render()
            episode_rewards.append(reward)

        print("Episode", i, "Reward:", sum(episode_rewards))
        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    min_episode_reward = np.min(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Min reward:",
          min_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward


# Test the trained agent
# evaluate(model, num_episodes=100)
# evaluate(model, num_episodes=5)
model.learn(100000)
# evaluate(model, num_episodes=5)
Example #26
#         index = np.argmin(best_mean_reward)
#         if mean_reward > best_mean_reward[index]:
#             best_mean_reward[index] = mean_reward
#             print('best_mean_reward', best_mean_reward)
#             _locals['self'].save(log_dir + 'best_model_{}.pkl'.format(str(mean_reward)))
#     n_steps += 1
#     return False

# log_dir = 'LiveStream_1229/ACKTRCust3_deletem8_zhongwang_diff_delay/'
log_dir = 'ACKTRtest/'

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
tstart = time.time()
num_cpu = 2

env = SubprocVecEnv([make_env(i, log_dir) for i in range(num_cpu)])

model = ACKTR(
    env=env,
    policy=LstmCust3Policy,
    verbose=1,
)

model.learn(total_timesteps=int(5e6), callback=callback)
model.save(log_dir + "last_model")

print('Time taken: {:.2f}'.format(time.time() - tstart))