Example #1
    def __init__(self, env, learning_rate, buffer_size, batch_size, n_epochs,
                 gamma, gae_lam, clip_range, ent_coef, vf_coef, max_grad_norm):
        self.env = env
        self.lr = learning_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.gamma = gamma
        self.gae_lam = gae_lam
        self.clip_range = clip_range
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.num_timesteps = 0

        self.ep_info_buffer = deque(maxlen=100)
        self._n_updates = 0
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        if isinstance(env, VecEnv):
            self.num_envs = env.num_envs
        else:
            self.num_envs = 1  # single (non-vectorized) environment

        self.rms_obs = RunningMeanStd(shape=(1, 1, 84, 84))
        self.rms_rew = RunningMeanStd()

        logger.configure('./logs')
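Example #1 normalizes image observations and rewards with RunningMeanStd trackers. A minimal sketch of such a tracker, assuming the standard parallel-moments (Chan et al.) update; this is an illustration, not the stable-baselines3 implementation:

import numpy as np

class RunningMeanStd:
    """Track a running mean and variance over batches (sketch)."""

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon  # avoids division by zero on the first update

    def update(self, batch):
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        batch_count = batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        # combine the two sets of moments (Chan et al. parallel update)
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / total)
        self.mean = self.mean + delta * batch_count / total
        self.var = m2 / total
        self.count = total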
Example #2
def test_main(tmp_path):
    """
    tests for the logger module
    """
    logger = configure(None, ["stdout"])
    logger.info("hi")
    logger.debug("shouldn't appear")
    assert logger.level == INFO
    logger.set_level(DEBUG)
    assert logger.level == DEBUG
    logger.debug("should appear")
    logger = configure(folder=str(tmp_path))
    assert logger.dir == str(tmp_path)
    logger.record("a", 3)
    logger.record("b", 2.5)
    logger.dump()
    logger.record("b", -2.5)
    logger.record("a", 5.5)
    logger.dump()
    logger.info("^^^ should see a = 5.5")
    logger.record("f", "this text \n \r should appear in one line")
    logger.dump()
    logger.info(
        '^^^ should see f = "this text \n \r should appear in one line"')
    logger.record_mean("b", -22.5)
    logger.record_mean("b", -44.4)
    logger.record("a", 5.5)
    logger.dump()

    logger.record("a", "longasslongasslongasslongasslongasslongassvalue")
    logger.dump()
    logger.warn("hey")
    logger.error("oh")
Example #3
def test_main(tmp_path):
    """
    tests for the logger module
    """
    info("hi")
    debug("shouldn't appear")
    set_level(DEBUG)
    debug("should appear")
    configure(folder=str(tmp_path))
    record("a", 3)
    record("b", 2.5)
    dump()
    record("b", -2.5)
    record("a", 5.5)
    dump()
    info("^^^ should see a = 5.5")
    record_mean("b", -22.5)
    record_mean("b", -44.4)
    record("a", 5.5)
    dump()
    with ScopedConfigure(None, None):
        info("^^^ should see b = 33.3")

    with ScopedConfigure(str(tmp_path / "test-logger"), ["json"]):
        record("b", -2.5)
        dump()

    reset()
    record("a", "longasslongasslongasslongasslongasslongassvalue")
    dump()
    warn("hey")
    error("oh")
    record_dict({"test": 1})
Example #4
def train(output_folder, load_path):
    base_output = Path(output_folder)
    full_output = base_output / datetime.datetime.now().isoformat(
        timespec="seconds")
    # latest = base_output / "latest"
    # latest.symlink_to(full_output)

    logger.configure(folder=str(full_output))

    env = LoveLetterMultiAgentEnv(num_players=4,
                                  reward_fn=Rewards.fast_elimination_reward)
    env.seed(SEED)

    # take mujoco hyperparams (but with timesteps_per_actorbatch doubled to cover more steps)
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        # def test_fn(env):
        #     return env.valid_action_mask()
        #
        model = PPO(MlpPolicy, env, verbose=1,
                    ent_coef=0.05)  #, action_mask_fn=test_fn)

    other_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    # other_agents = [
    #     PPO.load("zoo/ppo_logging/2020-12-27T15:51:49/final_model", env),
    # ]
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    # ]
    agents = [model, *other_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(
        env,
        best_model_save_path=str(full_output),
        log_path=str(full_output),
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
    )

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(str(full_output / "final_model"))

    env.close()
Example #5
def test_main(tmp_path):
    """
    tests for the logger module
    """
    info("hi")
    debug("shouldn't appear")
    assert get_level() == INFO
    set_level(DEBUG)
    assert get_level() == DEBUG
    debug("should appear")
    configure(folder=str(tmp_path))
    assert get_dir() == str(tmp_path)
    record("a", 3)
    record("b", 2.5)
    dump()
    record("b", -2.5)
    record("a", 5.5)
    dump()
    info("^^^ should see a = 5.5")
    record("f", "this text \n \r should appear in one line")
    dump()
    info('^^^ should see f = "this text \n \r should appear in one line"')
    record_mean("b", -22.5)
    record_mean("b", -44.4)
    record("a", 5.5)
    dump()
    with ScopedConfigure(None, None):
        info("^^^ should see b = 33.3")

    with ScopedConfigure(str(tmp_path / "test-logger"), ["json"]):
        record("b", -2.5)
        dump()

    reset()
    record("a", "longasslongasslongasslongasslongasslongassvalue")
    dump()
    warn("hey")
    error("oh")
    record_dict({"test": 1})
    assert isinstance(get_log_dict(), dict) and set(get_log_dict().keys()) == {"test"}
Example #6
def configure_logger(
    verbose: int = 0,
    tensorboard_log: Optional[str] = None,
    tb_log_name: str = "",
    reset_num_timesteps: bool = True,
) -> None:
    """
    Configure the logger's outputs.

    :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
    :param tensorboard_log: the log location for tensorboard (if None, no logging)
    :param tb_log_name: the name of the run for tensorboard logging
    :param reset_num_timesteps: whether the ``num_timesteps`` attribute is reset or not
    """
    if tensorboard_log is not None and SummaryWriter is not None:
        latest_run_id = get_latest_run_id(tensorboard_log, tb_log_name)
        if not reset_num_timesteps:
            # Continue training in the same directory
            latest_run_id -= 1
        save_path = os.path.join(tensorboard_log,
                                 f"{tb_log_name}_{latest_run_id + 1}")
        if verbose >= 1:
            logger.configure(save_path, ["stdout", "tensorboard"])
        else:
            logger.configure(save_path, ["tensorboard"])
    elif verbose == 0:
        logger.configure(format_strings=[""])
Example #7
def test_set_logger(tmp_path):
    # set up logger
    new_logger = configure(str(tmp_path), ["stdout", "csv", "tensorboard"])
    # Default outputs with verbose=0
    model = A2C("MlpPolicy", "CartPole-v1", verbose=0).learn(4)
    assert model.logger.output_formats == []

    model = A2C("MlpPolicy",
                "CartPole-v1",
                verbose=0,
                tensorboard_log=str(tmp_path)).learn(4)
    assert str(tmp_path) in model.logger.dir
    assert isinstance(model.logger.output_formats[0], TensorBoardOutputFormat)

    # Check that env variable works
    new_tmp_path = str(tmp_path / "new_tmp")
    os.environ["SB3_LOGDIR"] = new_tmp_path
    model = A2C("MlpPolicy", "CartPole-v1", verbose=0).learn(4)
    assert model.logger.dir == new_tmp_path

    # Default outputs with verbose=1
    model = A2C("MlpPolicy", "CartPole-v1", verbose=1).learn(4)
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    # with tensorboard
    model = A2C("MlpPolicy",
                "CartPole-v1",
                verbose=1,
                tensorboard_log=str(tmp_path)).learn(4)
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 2
    model.learn(32)
    # set new logger
    model.set_logger(new_logger)
    # Check that the new logger is correctly setup
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], CSVOutputFormat)
    assert isinstance(model.logger.output_formats[2], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 3
    model.learn(32)

    model = A2C("MlpPolicy", "CartPole-v1", verbose=1)
    model.set_logger(new_logger)
    model.learn(32)
    # Check that the new logger is not overwritten
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], CSVOutputFormat)
    assert isinstance(model.logger.output_formats[2], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 3
Example #8
def test_main():
    """
    tests for the logger module
    """
    info("hi")
    debug("shouldn't appear")
    set_level(DEBUG)
    debug("should appear")
    folder = "/tmp/testlogging"
    if os.path.exists(folder):
        shutil.rmtree(folder)
    configure(folder=folder)
    logkv("a", 3)
    logkv("b", 2.5)
    dumpkvs()
    logkv("b", -2.5)
    logkv("a", 5.5)
    dumpkvs()
    info("^^^ should see a = 5.5")
    logkv_mean("b", -22.5)
    logkv_mean("b", -44.4)
    logkv("a", 5.5)
    dumpkvs()
    with ScopedConfigure(None, None):
        info("^^^ should see b = 33.3")

    with ScopedConfigure("/tmp/test-logger/", ["json"]):
        logkv("b", -2.5)
        dumpkvs()

    reset()
    logkv("a", "longasslongasslongasslongasslongasslongassvalue")
    dumpkvs()
    warn("hey")
    error("oh")
    logkvs({"test": 1})
Example #9
def configure_logger(
    verbose: int = 0,
    tensorboard_log: Optional[str] = None,
    tb_log_name: str = "",
    reset_num_timesteps: bool = True,
) -> Logger:
    """
    Configure the logger's outputs.

    :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
    :param tensorboard_log: the log location for tensorboard (if None, no logging)
    :param tb_log_name: the name of the run for tensorboard logging
    :param reset_num_timesteps: whether the ``num_timesteps`` attribute is reset or not.
        This allows continuing a previous learning curve (``reset_num_timesteps=False``)
        or starting from t=0 (``reset_num_timesteps=True``, the default).
    :return: The logger object
    """
    save_path, format_strings = None, ["stdout"]

    if tensorboard_log is not None and SummaryWriter is None:
        raise ImportError(
            "Trying to log data to tensorboard but tensorboard is not installed."
        )

    if tensorboard_log is not None and SummaryWriter is not None:
        latest_run_id = get_latest_run_id(tensorboard_log, tb_log_name)
        if not reset_num_timesteps:
            # Continue training in the same directory
            latest_run_id -= 1
        save_path = os.path.join(tensorboard_log,
                                 f"{tb_log_name}_{latest_run_id + 1}")
        if verbose >= 1:
            format_strings = ["stdout", "tensorboard"]
        else:
            format_strings = ["tensorboard"]
    elif verbose == 0:
        format_strings = [""]
    return configure(save_path, format_strings=format_strings)
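A hypothetical call to the configure_logger defined above; the run folder name ("ppo_1", "ppo_2", ...) comes from get_latest_run_id, and the returned Logger exposes the record/dump API shown in the earlier examples:

new_logger = configure_logger(verbose=1, tensorboard_log="./tb", tb_log_name="ppo")
new_logger.record("custom/metric", 1.0)
new_logger.dump(step=0)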
Example #10
        eval_callback = EvalCallback(
            eval_env,
            best_model_save_path='best_model/' + name,
            log_path='best_model/' + name + '/',
            eval_freq=n_timesteps_episode * args.eval_freq,
            deterministic=True,
            render=False,
            n_eval_episodes=args.eval_length)
        callbacks.append(eval_callback)

    # Set up tensorboard logger
    if args.tensorboard:
        log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
        callbacks.append(log_callback)
        # let's change the default dir for the tensorboard output only
        tb_path = args.tensorboard + '/' + name
        new_logger = configure(tb_path, ["tensorboard"])
        model.set_logger(new_logger)

    callback = CallbackList(callbacks)

    # ---------------------------------------------------------------------------- #
    #                                   TRAINING                                   #
    # ---------------------------------------------------------------------------- #
    model.learn(total_timesteps=timesteps,
                callback=callback,
                log_interval=args.log_interval)
    model.save(env.simulator._env_working_dir_parent + '/' + name)

    # If the algorithm doesn't reset or close the environment, this script will do it in
    # order to correctly log all the simulation data (Energyplus + Sinergym
    # logs)
Example #11
# Train single CPU PPO1 on slimevolley.
# Should solve it (beat existing AI on average over 1000 trials) in 3 hours on single CPU, within 3M steps.

import os

import click
from stable_baselines3.common import logger
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.ppo import MlpPolicy, PPO

from gym_love_letter.agents import RandomAgent
from gym_love_letter.envs.base import LoveLetterMultiAgentEnv

LOGDIR = "ppo"  # moved to zoo afterwards.
logger.configure(folder=LOGDIR)

SEED = 721

# NUM_TIMESTEPS = int(2e7)
# EVAL_FREQ = 250000
# EVAL_EPISODES = 1000
NUM_TIMESTEPS = 300000
EVAL_FREQ = 5000
EVAL_EPISODES = 50


@click.command()
@click.option("--load", "-l", "load_path")
def train(load_path):
    env = LoveLetterMultiAgentEnv(num_players=4)
Example #12
def train(env_id, num_timesteps, seed):
    # sess = util.single_threaded_session()
    # sess.__enter__()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    # Create a new base directory like /home/marco/Reinforcement_Learning/Logs/openai-2018-05-21-12-27

    log_dir = os.path.join(
        energyplus_logbase_dir(),
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M"))
    if not os.path.exists(log_dir + '/output'):
        os.makedirs(log_dir + '/output')
    os.environ["ENERGYPLUS_LOG"] = log_dir
    model = os.getenv('ENERGYPLUS_MODEL')
    if model is None:
        print('Environment variable ENERGYPLUS_MODEL is not defined')
        exit()
    weather = os.getenv('ENERGYPLUS_WEATHER')
    if weather is None:
        print('Environment variable ENERGYPLUS_WEATHER is not defined')
        exit()

    # MPI is used to parallelize training.
    # Training is logged to a file log.txt in the given directory.

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print('train: init logger with dir={}'.format(log_dir))  # XXX
        logger.configure(log_dir)
    else:
        logger.configure(format_strings=[])
        logger.set_level(logger.DISABLED)

    # Make Gym environment:

    env = make_energyplus_env(env_id, workerseed)

    ###### EXPERIMENTS FROM FIRST PAPER: ###########################################
    #
    # trpo_mpi.learn(env,  policy_fn,
    #                max_timesteps=num_timesteps,
    #                timesteps_per_batch=16*1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
    #                gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)

    # Apply TRPO algorithm from OpenAI baselines:

    # action_noise = NormalActionNoise(mean=np.zeros(4), sigma=0.1 * np.ones(4))
    #
    # policy_kwargs_tqc = dict(n_critics=2, n_quantiles=25)
    # model_tqc = TQC("MlpPolicy", env, top_quantiles_to_drop_per_net=2
    #                 , verbose=1, policy_kwargs=policy_kwargs_tqc)
    #
    # model_ppo = PPO('MlpPolicy', env, verbose=1, n_steps=4096, batch_size=64, n_epochs=15)
    # model_td3 = TD3('MlpPolicy', env, verbose=1, action_noise=action_noise)
    # model_sac = SAC('MlpPolicy', env, verbose=1)
    # model_ppolstm = PPO2(MlpLstmPolicy, env, verbose=1,n_steps=27, nminibatches=1)
    #
    # # Change the algorithm here:
    #
    # model_ppolstm.learn(total_timesteps=num_timesteps, log_interval=1, reset_num_timesteps=False)
    # # model_ppo.learning_rate = 0
    # # model_ppo.learn(total_timesteps=35040, reset_num_timesteps=False)
    #
    #####################################EXPERIMENTS 2: ###################################

    sac_v2_lstm(env, num_timesteps, train=True, test=False)
    #slac(env, num_timesteps)

    env.close()
Example #13
if __name__ == '__main__':  # this is required due to forking processes
    run_id = str(uuid.uuid4())  # ALL running environments must share this
    print(f"RUN ID: {run_id}")

    # to pass launch args, add to env_kwargs: 'launch_args': ['render:=false', 'plot_log:=true']
    env = make_vec_env(RocketLeagueInterface,
                       env_kwargs={'run_id': run_id},
                       n_envs=24,
                       vec_env_cls=SubprocVecEnv)

    model = PPO("MlpPolicy", env)

    # log training progress as CSV
    log_dir = expanduser(f'~/catkin_ws/data/rocket_league/{run_id}')
    logger = configure(log_dir, ["stdout", "csv", "log"])
    model.set_logger(logger)

    # log model weights
    # freq = steps / (n_saves * n_envs); note that CheckpointCallback counts
    # per-env steps, so freq = 20833 means a checkpoint roughly every
    # 20833 * 24 = 500k total timesteps
    freq = 20833
    callback = CheckpointCallback(save_freq=freq, save_path=log_dir)

    # run training
    steps = 240000000  # 240M (10M sequential)
    print(f"training on {steps} steps")
    model.learn(total_timesteps=steps, callback=callback)

    # save final weights
    print("done training")
    model.save(log_dir + "/final_weights")
Example #14
        eval_callback = EvalCallback(
            env_vec,
            best_model_save_path='best_model/' + name + '/',
            log_path='best_model/' + name + '/',
            eval_freq=n_timesteps_episode *
            args.eval_freq,
            deterministic=True,
            render=False,
            n_eval_episodes=args.eval_length)
        callbacks.append(eval_callback)

    # Set up tensorboard logger
    if args.tensorboard:
        log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
        callbacks.append(log_callback)
        # let's change the default dir for the tensorboard output
        tb_path = args.tensorboard + '/' + name
        # format strings must be separate list entries
        new_logger = configure(tb_path, ["tensorboard", "stdout"])
        model.set_logger(new_logger)

    callback = CallbackList(callbacks)

    # Training
    model.learn(
        total_timesteps=timesteps,
        callback=callback,
        log_interval=args.log_interval)
    model.save(env.simulator._env_working_dir_parent + '/' + name)

    # End mlflow run
    mlflow.end_run()
Example #15
                    "rollout/ep_len_mean",
                    safe_mean([ep_info["l"]
                               for ep_info in a2c.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed",
                          int(time.time() - a2c.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps",
                          a2c.num_timesteps,
                          exclude="tensorboard")
            logger.dump(step=a2c.num_timesteps)

        a2c.train()

    # obs = cat.reset()
    # print("obs_shape")
    # print(obs.shape)
    # for i in range(10000):
    #     obs,rew,done,info = cat.step([0]*cat.num_envs)
    # print(obs.shape)
    # print(rew.shape)


configure("log", format_strings='stdout,log,csv'.split(','))
#with ScopedConfigure("log", format_strings='stdout,log,csv'.split(',')):
main()
#res = ProcConcatVec([env_contr])
# env = aec_to_markov(env)
# env.reset()
# env.step([0]*len(env.agents))
Example #16
import random
import time
from copy import deepcopy

import gym
import matplotlib
import numpy as np
import pandas as pd
from gym import spaces
from stable_baselines3.common.logger import configure
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

matplotlib.use("Agg")

# tf_logger = configure('/home/l/code/python/log/run', format_strings=['tensorboard'])
tf_logger = configure('/home/l/code/python/log/env', format_strings=['log'])
logger = configure('/home/l/code/python/log')

class StockTradingEnvCashpenalty(gym.Env):
    """
    A stock trading environment for OpenAI gym.
    This environment penalizes the model for not maintaining a reserve of cash.
    This enables the model to manage cash reserves in addition to performing trading procedures.
    Reward at any step is given as follows
        r_i = (sum(cash, asset_value) - initial_cash - max(0, sum(cash, asset_value)*cash_penalty_proportion-cash))/(days_elapsed)
        This reward function takes into account a liquidity requirement, as well as long-term accrued rewards.
    Parameters:
        df (pandas.DataFrame): Dataframe containing data
        buy_cost_pct (float): cost for buying shares
        sell_cost_pct (float): cost for selling shares
        hmax (int, array): maximum cash to be traded in each trade per asset. If an array is provided, each index corresponds to one asset
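The docstring's reward can be written out directly; the function below is a sketch with illustrative names taken from the formula itself, not from the class implementation:

def cash_penalty_reward(cash, asset_value, initial_cash,
                        cash_penalty_proportion, days_elapsed):
    total_assets = cash + asset_value
    # liquidity requirement: penalize any shortfall of cash below the target proportion
    penalty = max(0.0, total_assets * cash_penalty_proportion - cash)
    return (total_assets - initial_cash - penalty) / days_elapsed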