Example 1
config = {
    "num_team_a": 1,
    "num_team_b": 1,
    "width": 5,
    "height": 3,
    "density": 0.0,
    "max_turns": 50
}
a_config = {"hp": 30, "skill": 50, "fray": 40, "dmg": "2d10+6"}
b_config = {"hp": 30, "skill": 50, "fray": 40, "dmg": "2d10+6"}

ray.init()

env = epenv.EP_Environment(config, a_config, b_config)
check_env(env)

register_env("ep_environment",
             lambda _: epenv.EP_Environment(config, a_config, b_config))

trainer = PPO2(env=env,
               config={
                   "multiagent": {
                       "policies": {
                           "one":
                           (None, env.observation_space, env.action_space, {}),
                           "two":
                           (None, env.observation_space, env.action_space, {}),
                       }
                   }
               })
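
The multiagent config above defines two policies but no agent-to-policy mapping; a minimal sketch of how a policy_mapping_fn could be added in RLlib style (the agent-id prefix convention is an assumption, not taken from the snippet):

multiagent_config = {
    "multiagent": {
        "policies": {
            "one": (None, env.observation_space, env.action_space, {}),
            "two": (None, env.observation_space, env.action_space, {}),
        },
        # The "a"/"b" prefix is an assumption about how EP_Environment names its agents.
        "policy_mapping_fn": lambda agent_id: "one"
        if str(agent_id).startswith("a") else "two",
    }
}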
Example 2
    print('Saving the config file in path: {}'.format(specified_path))
    with open(join(specified_path, 'config.yml'), 'w') as f:
        yaml.dump(config, f, indent=4, sort_keys=False, line_break=' ')

    # train model
    try:
        try:
            model_path = join(specified_path, 'pretrained-model.zip')
            model = PPO2.load(model_path,
                              env=env_8,
                              tensorboard_log=specified_path)
            print("Existing model loaded...")

        except Exception:  # no pretrained model found; fall back to a fresh one
            model = PPO2(policy,
                         env=env_8,
                         tensorboard_log=specified_path,
                         **model_config)
            print('New model created..')

        # Launch the tensorboard
        if args.tensorboard:
            launch_tensorboard(specified_path)

        start = datetime.now()
        print('Start time training: {}'.format(start))
        model.learn(total_timesteps=n_steps,
                    tb_log_name='{}_{}'.format(max_in_dir, args.name),
                    callback=eval_callback)
        model_path = join(specified_path,
                          '{}_final_model.zip'.format(max_in_dir))
        model.save(model_path)
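
The eval_callback passed to model.learn() is not defined in this excerpt; a minimal sketch using stable-baselines' EvalCallback, assuming a separate evaluation environment env_eval:

from stable_baselines.common.callbacks import EvalCallback

# Hypothetical evaluation callback: env_eval is assumed to exist; it evaluates
# the agent periodically and keeps the best model under specified_path.
eval_callback = EvalCallback(env_eval,
                             best_model_save_path=specified_path,
                             log_path=specified_path,
                             eval_freq=10000,
                             deterministic=True)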
Example 3
    else:

        if args.multi:

            #setup random offset of network ports
            os.environ['DONKEY_SIM_MULTI'] = '1'

            # Number of processes to use
            num_cpu = 4

            # Create the vectorized environment
            env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

            #create recurrent policy
            model = PPO2(CnnLstmPolicy, env, verbose=1)

        else:

            #make gym env
            env = gym.make(env_id)

            # Create the vectorized environment
            env = DummyVecEnv([lambda: env])

            #create cnn policy
            model = PPO2(CnnPolicy, env, verbose=1)

        #set up model in learning mode with goal number of timesteps to complete
        model.learn(total_timesteps=10000)
Example 4
            agent = PPO2.load(
                agent_params["pretrained_agent"].value,
                env=env,
                reset_num_timesteps=False,
                n_steps=agent_params["update_nepisodes"].value *
                (eng_params["nsteps"].value - 1),
                learning_rate=agent_params["learning_rate"].value,
                gamma=agent_params["gamma"].value,
                tensorboard_log=logdir,
            )
        else:
            agent = PPO2(
                MlpPolicy,
                env,
                verbose=1,
                n_steps=agent_params["update_nepisodes"].value *
                (eng_params["nsteps"].value - 1),
                learning_rate=agent_params["learning_rate"].value,
                gamma=agent_params["gamma"].value,
                tensorboard_log=logdir,
            )
        agent.learn(
            total_timesteps=agent_params["number_episodes"].value *
            (eng_params["nsteps"].value - 1) * agent_params["nranks"].value,
            callback=callback,
        )
    elif agent_params["agent"].value == "manual":
        env = DummyVecEnv([lambda: eng])
        agent = agents.ManualAgent(env)
        agent.learn(agent_params["injection_cas"].value,
                    agent_params["qdot_cas"].value)
Example 5
import time
import json
import random
import sys

import numpy as np  # needed for np.set_printoptions below
import wandb  # used to log evaluation info below

np.set_printoptions(threshold=sys.maxsize)

env = DummyVecEnv(
    [lambda: SwapTradingEnv(data_file='./data/DATA.parquet', training=True)])

test_env = DummyVecEnv(
    [lambda: SwapTradingEnv(data_file='./data/TEST.parquet', training=True)])

# env = SubprocVecEnv([lambda: SwapTradingEnv(
#             data_file='/home/thorad/Core/Projects/SwapTrader/data/BTC-USD-SWAP-FRACDIFF.parquet'
#         ) for i in range(2)])

model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./tensorboard")

for x in range(5):
    model.learn(50000)
    model.save('./agents/agent_' + str(x) + '.pkl')

    obs = test_env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = test_env.step(action)
        print(reward)
        print(info)
        wandb.log(info[0])
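
wandb.log() above assumes a run was initialised earlier in the script; a minimal sketch of that missing call, with a placeholder project name:

# Hypothetical initialisation, to be placed before the training loop;
# the project and run names are placeholders.
wandb.init(project="swap-trading", name="ppo2_baseline")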
Example 6
    'eta': 1,
    'X_kunit': 0.49,
    'theta': 0.1
}  #if a parameter is set to None it will be sampled from a uniform distribution at every reset
args = {
    'feedback': feedback,
    'q': qs,
    'params': params
}  # the default parameters are: rewfunc=Tools.purity_like_rew, q=1e-4, dt=1e-3, plot=False, pow=0.5
#instantiate environment
env = make_vec_env(FisherEnv, n_envs=N, env_kwargs=args)
#instantiate model
model = PPO2(MlpPolicy,
             env,
             n_steps=128,
             learning_rate=LR,
             lam=0.95,
             ent_coef=e_c,
             verbose=1,
             nminibatches=4,
             noptepochs=4,
             tensorboard_log='./Fisher_mix_TRAIN_LOG/{}/{}_q{}'.format(
                 dirname, title, qs),
             seed=2)
#train the model
model.learn(total_timesteps=TIMESTEPS,
            callback=callback,
            tb_log_name='{}_q{}'.format(title, qs))
#save the trained model at a given path
model.save('./MODELS/{}/{}_q{}'.format(dirname, title, qs))
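
The callback passed to model.learn() is not shown in this excerpt; a minimal sketch of a periodic-checkpoint callback in stable-baselines' (locals, globals) style (the checkpoint path is a placeholder):

n_calls = 0

def callback(_locals, _globals):
    """Hypothetical checkpoint callback: save the model every 100 updates."""
    global n_calls
    n_calls += 1
    if n_calls % 100 == 0:
        _locals['self'].save('./MODELS/{}/{}_q{}_checkpoint'.format(dirname, title, qs))
    return True  # returning False would stop training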
Example 7
def objective(trial):
    # Define what to optimize in environment
    envParams = {
        'reward_func':
        reward_strategy,
        'forecast_len':
        int(trial.suggest_loguniform('forecast_len', 1, 200)),
        'confidence_interval':
        trial.suggest_uniform('confidence_interval', 0.7, 0.99),
    }
    train_df, test_df = getDatasets(
        params.get('input_data_file'),
        percentageToUse=params.get('dataset_percentage'))
    trainEnv = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, **envParams)])
    testEnv = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, **envParams)])

    # Define what to optimize in agent
    agentParams = {
        'n_steps': int(trial.suggest_loguniform('n_steps', 16, 2048)),
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
        'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
        'noptepochs': int(trial.suggest_loguniform('noptepochs', 1, 48)),
        'lam': trial.suggest_uniform('lam', 0.8, 1.)
    }

    model = PPO2(MlpLnLstmPolicy,
                 trainEnv,
                 verbose=0,
                 nminibatches=1,
                 **agentParams)

    # Run optimizer
    last_reward = -np.finfo(np.float16).max
    evaluation_interval = int(len(train_df) / params.get('n_test_episodes'))

    for eval_idx in range(params.get('n_evaluations')):
        try:
            model.learn(evaluation_interval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        obs = testEnv.reset()
        while n_episodes < params.get('n_test_episodes'):
            action, _ = model.predict(obs)
            obs, reward, done, _ = testEnv.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = testEnv.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
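
A sketch of how this objective might be driven; the study name, pruner, and trial budget are assumptions:

# Hypothetical Optuna driver for the objective above.
study = optuna.create_study(study_name='ppo2_trading',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=100)
print('Best hyperparameters:', study.best_params)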
Example 8
                f.close()
        else:
            save_path = '../logs/'
            env = Monitor(env, '../logs/')                                   # logging monitor
        model_dir = save_path + '{}_final_model'.format(args.alg)                                       # model save/load directory

        if args.alg == 'ddpg':
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=args.action_noise * np.ones(n_actions))

            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(args.param_noise_stddev),
                                                 desired_action_stddev=float(args.param_noise_stddev))
            model = DDPG(DDPGPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise,
                         render=args.play)
        elif args.alg == 'ppo2':
            model = PPO2(CommonMlpPolicy, env, verbose=1)
        elif args.alg == 'trpo':
            model = TRPO(CommonMlpPolicy, env, verbose=1, model_dir=save_path)
        elif args.alg =='a2c':
            model = A2C(CommonMlpPolicy, env, verbose=1)
        else:
            print(args.alg)
            raise Exception('Algorithm name is not defined!')

        print('Model is Created')
        try:
            print('Training Started')
            if args.alg == 'ddpg':
                model.learn(total_timesteps=args.num_timesteps, log_interval=args.log_interval, save_path=save_path)
            else:
                model.learn(total_timesteps=args.num_timesteps, log_interval=args.log_interval)
Example 9
from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO
from stable_baselines.ddpg import AdaptiveParamNoiseSpec
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv

PARAM_NOISE_DDPG = AdaptiveParamNoiseSpec(initial_stddev=float(0.2), desired_action_stddev=float(0.2))

# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'dqn': lambda e: DQN(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ddpg': lambda e: DDPG(policy="MlpPolicy", env=e, param_noise=PARAM_NOISE_DDPG).learn(total_timesteps=1000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'sac': lambda e: SAC(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
}


@pytest.mark.slow
@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo'])
def test_identity(model_name):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)

    :param model_name: (str) Name of the RL model
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])
Example 10
def train(
        task,
        alg,
        logdir,
        domain_name,
        *,
        random_seed=None,
        num_steps=int(2e3),
        log_every=int(10e3),
        num_parallel=8,
        load_policy=False,
        load_policy_dir="",
        **kwargs
):
    """Train and evaluate an agent

    Args:
        task (str): Jitterbug task to train on
        alg (str): Algorithm to train, one of;
            - 'ddpg': DDPG Algorithm
            - 'ppo2': PPO2 Algorithm
            - 'sac': SAC Algorithm
        logdir (str): Logging directory
        domain_name (str): Name of the DMC domain

        random_seed (int): Random seed to use, or None
        num_steps (int): Number of training steps to train for
        log_every (int): Save and log progress every this many timesteps
        num_parallel (int): Number of parallel environments to run. Only used
            for A2C and PPO2.
        load_policy (bool): Whether to load an existing policy or not. If True,
            the policy is loaded from load_policy_dir.
        load_policy_dir (str): Directory to load an existing policy from
    """

    assert alg in ('ddpg', 'sac', 'ppo2', 'td3'), "Invalid alg: {}".format(alg)
    assert domain_name in ('jitterbug', 'augmented_jitterbug'), "Invalid domain_name: {}".format(domain_name)

    # Cast args to types
    if random_seed is not None:
        random_seed = int(random_seed)
    else:
        random_seed = int(time.time())

    # Fix random seed
    random.seed(random_seed)
    np.random.seed(random_seed)

    # Prepare the logging directory
    os.makedirs(logdir, exist_ok=True)

    print("Training {} on {} with seed {} for {} steps "
          "(log every {}), saving to {}".format(
        alg,
        task,
        random_seed,
        num_steps,
        log_every,
        logdir
    ))

    if domain_name == "augmented_jitterbug":
        augmented_jitterbug.augment_Jitterbug(modify_legs=True,
                                              modify_mass=True,
                                              modify_coreBody1=False,
                                              modify_coreBody2=False,
                                              modify_global_density=False,
                                              modify_gear=False,
                                              )
    # Construct DMC env
    env_dmc = suite.load(
        domain_name=domain_name,
        task_name=task,
        task_kwargs=dict(random=random_seed, norm_obs=True),
        environment_kwargs=dict(flat_observation=True)
    )

    # Wrap gym env in a dummy parallel vector
    if alg in ('ppo2',):  # note the trailing comma: ('ppo2') would be a plain string

        if num_parallel > multiprocessing.cpu_count():
            warnings.warn("Number of parallel workers "
                          "({}) > CPU count ({}), setting to # CPUs - 1".format(
                num_parallel,
                multiprocessing.cpu_count()
            ))
            num_parallel = max(
                1,
                multiprocessing.cpu_count() - 1
            )

        print("Using {} parallel environments".format(num_parallel))
        # XXX ajs 13/Sep/19 Hack to create multiple monitors that don't write to the same file
        env_vec = SubprocVecEnv([
            lambda: Monitor(
                gym.wrappers.FlattenDictWrapper(
                    jitterbug_dmc.JitterbugGymEnv(env_dmc),
                    dict_keys=["observations"]
                ),
                os.path.join(logdir, str(random.randint(0, 99999999))),
                allow_early_resets=True
            )
            for n in range(num_parallel)
        ])

    else:

        num_parallel = 1
        env_vec = DummyVecEnv([
            lambda: Monitor(
                gym.wrappers.FlattenDictWrapper(
                    jitterbug_dmc.JitterbugGymEnv(env_dmc),
                    dict_keys=["observations"]
                ),
                logdir,
                allow_early_resets=True
            )
        ])

    # Record start time
    start_time = datetime.datetime.now()

    def _cb(_locals, _globals):
        """Callback for during training"""

        if 'last_num_eps' not in _cb.__dict__:
            _cb.last_num_eps = 0

        # Extract episode reward history based on model type
        if isinstance(_locals['self'], DDPG):
            ep_r_hist = list(_locals['episode_rewards_history'])
        elif isinstance(_locals['self'], PPO2):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        elif isinstance(_locals['self'], SAC):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        elif isinstance(_locals['self'], TD3):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        else:
            raise ValueError("Invalid algorithm: {}".format(
                _locals['self']
            ))

        # Compute # elapsed steps based on # elapsed episodes
        ep_size = int(
            jitterbug_dmc.jitterbug.DEFAULT_TIME_LIMIT /
            jitterbug_dmc.jitterbug.DEFAULT_CONTROL_TIMESTEP
        )
        num_eps = len(ep_r_hist)
        elapsed_steps = ep_size * num_eps

        # Compute elapsed time in seconds
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()

        # Log some info
        if num_eps != _cb.last_num_eps:
            _cb.last_num_eps = num_eps

            print("{:.2f}s | {}ep | {}#: episode reward = "
                  "{:.2f}, last 5 episode reward = {:.2f}".format(
                elapsed_time,
                num_eps,
                elapsed_steps,
                ep_r_hist[-1],
                np.mean(ep_r_hist[-5:])
            ))

            # Save model checkpoint
            model_path = os.path.join(logdir, "model.pkl")
            print("Saved checkpoint to {}".format(model_path))
            _locals['self'].save(model_path)

        return True

    if alg == 'ddpg':

        # Default parameters for DDPG
        # kwargs.setdefault("normalize_returns", True)
        # kwargs.setdefault("return_range", (0., 1.))
        # kwargs.setdefault("normalize_observations", True)
        # kwargs.setdefault("observation_range", (-1., 1.))

        kwargs.setdefault("batch_size", 256)

        kwargs.setdefault("actor_lr", 1e-4)
        kwargs.setdefault("critic_lr", 1e-4)

        kwargs.setdefault("buffer_size", 1000000)

        kwargs.setdefault("action_noise", OrnsteinUhlenbeckActionNoise(
            mean=np.array([0.3]),
            sigma=0.3,
            theta=0.15
        ))

        print("Constructing DDPG agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        if load_policy:
            print("Load DDPG agent from ", load_policy_dir)
            agent = DDPG.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                              policy=CustomPolicyDDPG,
                              env=env_vec,
                              verbose=1,
                              tensorboard_log=logdir,
                              **kwargs
                              )
        else:
            agent = DDPG(
                policy=CustomPolicyDDPG,
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    elif alg == 'ppo2':

        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("n_steps", 256 // num_parallel)
        kwargs.setdefault("ent_coef", 0.01)
        kwargs.setdefault("cliprange", 0.1)

        print("Constructing PPO2 agent with settings:")
        pprint.pprint(kwargs)

        if load_policy:
            print("Load PPO2 agent from ", load_policy_dir)
            agent = PPO2.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                              policy=CustomPolicyGeneral,
                              env=env_vec,
                              verbose=1,
                              tensorboard_log=logdir,
                              **kwargs
                              )
        else:
            agent = PPO2(
                policy=CustomPolicyGeneral,
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb,
            log_interval=10
        )

    elif alg == 'sac':

        # Default parameters for SAC
        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("buffer_size", 1000000)
        kwargs.setdefault("batch_size", 256)
        kwargs.setdefault("ent_coef", 'auto')
        # kwargs.setdefault("ent_coef", 'auto_0.1')

        kwargs.setdefault("action_noise", NormalActionNoise(
            mean=0,
            sigma=0.2,
        ))

        print("Constructing SAC agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        # XXX ajs 14/Sep/19 SAC in stable_baselines uses outdated policy
        # classes so we just use MlpPolicy and pass policy_kwargs

        if load_policy:
            print("Load SAC agent from ", load_policy_dir)
            kwargs.setdefault("policy_kwargs", dict(layers=[350, 250], act_fun=tf.nn.relu))
            agent = SAC.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                             env=env_vec,
                             verbose=1,
                             tensorboard_log=logdir,
                             **kwargs
                             )
        else:
            agent = SAC(
                policy='MlpPolicy',
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                policy_kwargs=dict(layers=[350, 250], act_fun=tf.nn.relu),
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    elif alg == 'td3':

        # Default parameters for TD3
        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("buffer_size", 1000000)
        kwargs.setdefault("batch_size", 256)
        kwargs.setdefault("gradient_steps", 1000)
        kwargs.setdefault("learning_starts", 10000)
        kwargs.setdefault("train_freq", 1000)

        # kwargs.setdefault("ent_coef", 'auto_0.1')

        kwargs.setdefault("action_noise", NormalActionNoise(
            mean=0,
            sigma=0.2,
        ))

        print("Constructing TD3 agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        # XXX ajs 14/Sep/19 TD3 in stable_baselines uses outdated policy
        # classes so we just use MlpPolicy and pass policy_kwargs
        if load_policy:
            print("Load TD3 agent from ", load_policy_dir)
            kwargs.setdefault("policy_kwargs", dict(layers=[350, 250], act_fun=tf.nn.relu))
            agent = TD3.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                             env=env_vec,
                             verbose=1,
                             tensorboard_log=logdir,
                             **kwargs
                             )
        else:
            agent = TD3(
                policy='MlpPolicy',
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                policy_kwargs=dict(layers=[350, 250], act_fun=tf.nn.relu),
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    else:
        raise ValueError("Invalid alg: {}".format(alg))

    # Save final model
    agent.save(os.path.join(logdir, 'model.final.pkl'))

    print("Done")
Example 11
from util import log_dir, callback, AirstrikerDiscretizer, CustomRewardAndDoneEnv

# Create the environment (1)
env = retro.make(game='Airstriker-Genesis', state='Level1')
env = AirstrikerDiscretizer(env)  # convert the action space to a discrete space
env = CustomRewardAndDoneEnv(env)  # customize reward and episode termination
env = StochasticFrameSkip(env, n=4, stickprob=0.25)  # sticky frame skip
env = Downsample(env, 2)  # downsampling
env = Rgb2gray(env)  # convert to grayscale
env = FrameStack(env, 4)  # frame stacking
env = ScaledFloatFrame(env)  # normalize observations
env = Monitor(env, log_dir, allow_early_resets=True)
print('Action space: ', env.action_space)
print('Observation space: ', env.observation_space)

# Set the random seed
env.seed(0)
set_global_seeds(0)

# Create the vectorized environment
env = DummyVecEnv([lambda: env])

# Create the model
model = PPO2('CnnPolicy', env, verbose=1)

# Train the model
model.learn(total_timesteps=300000, callback=callback)

# Save the model
model.save('PPO2')
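
A short sketch of reloading the saved model and replaying one episode in the same vectorized environment:

# Reload the trained model and run a single greedy episode.
model = PPO2.load('PPO2')
obs = env.reset()
done = [False]
while not done[0]:
    action, _ = model.predict(obs, deterministic=True)
    obs, rewards, done, info = env.step(action)
    env.render()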
Example 12
total_timesteps_ = 3000000  # 3000000 for sac,  500000 for ppo2, 1500000 for ddpg
exp_num = "2"
tensorboard_log_name = algorithm + "_" + exp_num + "_" + env_name
tensorboard_log_dir = "./logs/"
# tensorboard --logdir=PPO2_1_Ex3_EKF_gyro-v0_1 --port=6006 --host=127.0.0.1
# tensorboard --logdir=sac_ekf_3_3 --port=6007 --host=127.0.0.2
model_save_name = tensorboard_log_name + "_model_" + exp_num

if algorithm == "PPO2":
    from itertools import cycle
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common import make_vec_env
    from stable_baselines import PPO2
    env = make_vec_env(env_name, n_envs=3)
    model = PPO2(MlpPolicy,
                 env,
                 verbose=1,
                 tensorboard_log=tensorboard_log_dir)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "PPO1":
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import PPO1
    env = gym.make(env_name)
    model = PPO1(MlpPolicy, env, verbose=1)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_)
            model.save(model_save_name)
Example 13
# print(data_df.shape, 'after filling na')

# Note that I always use a copy of the original data to try it track step by step.
data_clean = data_df.copy()
# data_clean.head()
# data_clean.tail()

train = data_clean[1:2000]
# the index needs to start from 0
train = train.reset_index(drop=True)
# train.head()

#tensorboard --logdir ./single_stock_tensorboard/
env_train = DummyVecEnv([lambda: SingleStockEnv(train)])
model_ppo = PPO2('MlpPolicy',
                 env_train,
                 tensorboard_log="./single_stock_trading_2_tensorboard/")
model_ppo.learn(total_timesteps=100000, tb_log_name="run_aapl_ppo")
#model.save('AAPL_ppo_100k')

test = data_clean[2000:]
# the index needs to start from 0
test = test.reset_index(drop=True)

model = model_ppo
env_test = DummyVecEnv([lambda: SingleStockEnv(test)])
obs_test = env_test.reset()
print("==============Model Prediction===========")

# for i in range(len(test.index.unique())):
#     print("testing", i, "th")
Example 14
        else:
            bench.config[k] = config[k]
    return bench.get_environment()


# Experiment configuration
# Play 5D scenario
action_values = (3, 3, 3, 3, 3)
env_config = {
    "seed": 0,
    "action_values": action_values,
    "instance_set_path": "../instance_sets/sigmoid/sigmoid_5D3M_train.csv",
}

# Make environment
# To track rewards we use our wrapper (this is only for simplicity)
env = make_sigmoid(env_config)
env = PerformanceTrackingWrapper(env)

# Make simple PPO policy
model = PPO2("MlpPolicy", env)

# Train for 200 timesteps
model.learn(total_timesteps=200)

performance = env.get_performance()[0]
for i in range(len(performance)):
    print(
        f"Episode {i+1}/{len(performance)}...........................................Reward: {performance[i]}"
    )
Example 15
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from FrankaGymEnvironment import CustomEnv

my_signal_rate = 100
my_signal_repetitions = 15
my_step_limit = 24

env = CustomEnv(signal_rate= my_signal_rate, signal_repetitions= my_signal_repetitions, step_limit= my_step_limit)
# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
# env = DummyVecEnv([lambda: env])

my_learning_rate = 0.003
timesteps = 160000
# Configure tensorflow using GPU
# Use tensorboard to show reward over time etc
model = PPO2(MlpPolicy, env, learning_rate= my_learning_rate, verbose=1, tensorboard_log="/home/ryuga/Documents/TensorBoardLogs") # defaults: learning_rate=2.5e-4,
model.learn(total_timesteps=timesteps)

name = "BALLS_franka_continuous_ppo2"  + str(my_learning_rate) + "_timesteps_" + str(timesteps)
model.save(name)

f = open("envparameters_" + name, "x")
f.write(str([my_signal_rate, my_signal_repetitions, my_step_limit]))
f.close()
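
A sketch of reading the saved artifacts back later; ast.literal_eval parses the parameter list written above:

import ast

# Reload the trained agent and the environment parameters saved above.
model = PPO2.load(name)
with open("envparameters_" + name) as f:
    my_signal_rate, my_signal_repetitions, my_step_limit = ast.literal_eval(f.read())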
Example 16
def run_model(is_train=True, model_name='rl_model'):
    df = pd.read_csv('./data/db.csv')
    df = df.sort_values('date')
    df = df.drop(columns='date')

    df = df.dropna().reset_index()
    print(df.isnull().sum().sum())

    # train, test = train_test_split(df, test_size=0.1)

    train = df[:int(0.9 * len(df))].reset_index()
    test = df[int(0.9 * len(df)):].reset_index()
    # The algorithms require a vectorized environment to run
    if is_train:
        env = DummyVecEnv([lambda: StockTradingEnv(train, train, 29, True)])
    else:
        env = DummyVecEnv([lambda: StockTradingEnv(test, train, 29, False)])

    if is_train and model_name == 'rl_rand_model':
        model = PPO2(RandomPolicy,
                     env,
                     verbose=11,
                     tensorboard_log="./log/rand_stock_tensorboard/")

    elif not is_train and model_name == 'rl_rand_model':
        #model = PPO2.load("./ckpt/rl_rand_model")
        model = PPO2(RandomPolicy,
                     env,
                     verbose=11,
                     tensorboard_log="./log/rand_stock_tensorboard/")

    elif is_train and model_name == 'rl_model':
        #model = PPO2(CustomPolicy, env, verbose=11, tensorboard_log="./log/ppo2_stock_tensorboard/")
        model = PPO2.load("./ckpt/rl_model", env=env)
    elif not is_train and model_name == 'rl_model':
        model = PPO2.load("./ckpt/rl_model", env=env)

    elif not is_train and model_name == 'hr_model':
        model = Heristic(env)
    elif is_train and model_name == 'hr_model':
        model = Heristic(env)

    elif not is_train and model_name == 'rnn_model':
        model = Baseline(env)
    elif is_train and model_name == 'rnn_model':
        model = Baseline(env)
    else:
        assert False

    for epoch in range(1):
        if model_name == 'rl_model' and is_train:
            obs = env.reset()
            model.learn(total_timesteps=100000)
            model.save("./ckpt/rl_model")
        obs = env.reset()
        success = []
        for i in range(len(test.loc[:, 'TROW_PRC'].values) - 30):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            success.append(info[0]['success_rate'])
            env.render()
        return success
Example 17
    actor_options = {
        'learning_rate': lr,
        'gamma': 1.,
        'verbose': 0,
        'n_steps': 100,
        'ent_coef': 0.,
        'max_grad_norm': 1e2,
    }

    description = ','.join(
        ['{}={}'.format(k, v) for k, v in actor_options.items()])
    description += ',num_env={},norm_obs={},norm_reward={}'.format(
        num_env, norm_obs, norm_reward)

    learning_options = {'total_timesteps': int(1e6)}

    # Wrap in a try statement to close the environment properly in case of keyboard interrupt.
    try:
        envs = [make_mujoco_env(env_name, 2) for _ in range(num_env)]
        # env = DummyVecEnv([lambda env=env: env for env in envs])
        # Bind each env as a default argument: a plain `lambda: env` is evaluated
        # late and would hand every worker the same (last) environment.
        env = SubprocVecEnv([lambda env=env: env for env in envs])
        env = VecNormalize(env, norm_obs=norm_obs, norm_reward=norm_reward)

        # Create the actor and learn
        actor_options['tensorboard_log'] = os.path.join(
            tensorboard_logdir, env_name)
        model = PPO2(MlpPolicy, env, **actor_options)
        # model = PPO2(MlpLstmPolicy, env, **actor_options)
        model.learn(**learning_options, tb_log_name=description)
    finally:
        env.close()
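
The default-argument fix above relies on a general Python closure rule; a tiny standalone illustration:

# Closures capture variables, not values: without the default argument every
# lambda sees the loop variable's final value.
fns_late = [lambda: i for i in range(3)]
fns_bound = [lambda i=i: i for i in range(3)]
print([f() for f in fns_late])   # [2, 2, 2]
print([f() for f in fns_bound])  # [0, 1, 2]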
Example 18
    if args.load:
        print('Model Loaded')
        load_file = args.load
        env = DummyVecEnv([lambda: env])
        model = PPO2.load(load_file, env=env)

    else:
        print('Training')
        gamma = 0.9  # discount rate
        #self.epsilon_decay = 0.99
        learning_rate = 1e-4
        target_network_update_freq = 1000
        model = PPO2(MlpPolicy,
                     env,
                     verbose=0,
                     gamma=gamma,
                     noptepochs=8,
                     nminibatches=8,
                     learning_rate=learning_rate,
                     ent_coef=0.001,
                     tensorboard_log=tensorboard_log_dir)

    if args.evaluate:
        if not args.load:
            print('Load a model to evaluate')
        evaluate_policy(model, env, deterministic=False, n_eval_episodes=10)

    else:
        model.learn(total_timesteps=int(1e7), callback=callback)

    print('done simulation')
Example 19
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.env_checker import check_env

import numpy as np

from gym import spaces

# Since opponent_model wasn't set, this environment uses a random
# opponent.
random_env = gym.make('custom_gyms:tictac4-v0')
check_env(random_env)

# set n_cpu_tf_sess so that this will just run on 1 thread. We need this for determinism.
model = PPO2("MlpPolicy",
             random_env,
             verbose=False,
             learning_rate=0.0025,
             nminibatches=4,
             n_cpu_tf_sess=1,
             seed=1)

mean_reward, std_reward = evaluate_policy(model,
                                          random_env,
                                          n_eval_episodes=2,
                                          deterministic=True,
                                          render=True)
print(f'random opponent: mean reward: {mean_reward}, std reward {std_reward}')
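
The evaluation above runs against an untrained model; a short continuation sketch that trains and re-evaluates (the timestep budget is an assumption):

# Train for a while, then evaluate again to compare with the untrained baseline.
model.learn(total_timesteps=50000)
mean_reward, std_reward = evaluate_policy(model,
                                          random_env,
                                          n_eval_episodes=2,
                                          deterministic=True)
print(f'after training: mean reward: {mean_reward}, std reward {std_reward}')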
Example 20
description_str = "trajectories1-124_e7_l7_5sec_gamma0.3_t0_r1.0_lin0.001_rot0.04"
log_dir = "../runs/tensorboard/ppo2_2019_07_03_trajectory002/"
continue_learning = False
# bagfiles = ["../resources/torque_trajectory_{0:03}.bag".format(i) for i in range(1, 125)]
bagfiles = ["../resources/torque_trajectory_001.bag"]
gazebo_env = GazeboEnv(0.1, 5.0, bagfiles, example_embodiments.panda_embodiment, example_embodiments.panda_embodiment)
env = DummyVecEnv([lambda: Monitor(gazebo_env, "../runs/monitor/", allow_early_resets=True)])

if continue_learning:
    model = PPO2.load("../runs/models/trajectory003_e7_l4_5sec_gamma0.3_t0_r1.0_lin0.001_rot0.01_best.pkl",
                      env=env,
                      tensorboard_log=log_dir)
else:
    model = PPO2(MlpPolicy, env, verbose=2,
                 tensorboard_log=log_dir,
                 gamma=0.3,
                 # n_steps=30,
                 # nminibatches=1
                 )

best_mean_reward, n_steps = -np.inf, 0


def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward
    # Print stats every 100 calls
    if (n_steps + 1) % 100 == 0:
Example 21
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: ActionMaskEnv()])

model = PPO2(get_policy(policy),
             env,
             verbose=0,
             nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=25000, tb_log_name='PPO2' + model_tag)

model.save(model_folder + "PPO2" + model_tag)
del model
model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()

while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
Example 22
                                 ball_friction=0.3,
                                 ball_elasticity=1.5,
                                 max_cycles=125)
env = ss.color_reduction_v0(env, mode='B')
env = ss.resize_v0(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)
env = ss.pettingzoo_env_to_vec_env_v0(env)
env = ss.concat_vec_envs_v0(env, 8, num_cpus=4, base_class='stable_baselines')

model = PPO2(CnnPolicy,
             env,
             verbose=3,
             gamma=0.99,
             n_steps=125,
             ent_coef=0.01,
             learning_rate=0.00025,
             vf_coef=0.5,
             max_grad_norm=0.5,
             lam=0.95,
             nminibatches=4,
             noptepochs=4,
             cliprange=0.2,
             cliprange_vf=1)
model.learn(total_timesteps=2000000)
model.save("policy")

# Rendering

env = pistonball_v3.env()
env = ss.color_reduction_v0(env, mode='B')
env = ss.resize_v0(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)
Example 23
#p_quarks = dict(net_arch=[256, 256, dict(
#    vf=[256, 128], pi=[64])])


name = "Rev_i7_DISCRETE_DefNN_RndmBall_Phys006_ppo2_franka_LR_" + print_LR + "_timesteps_" + \
    str(timesteps) + "_srate_sreps_slimit_" + str(my_signal_rate) + \
    str(my_signal_repetitions) + str(my_step_limit) + "_joints_" + str(my_number_of_joints) + "_rdmBall_" + str(my_randomBall) + "_ballPos_" + str(my_ballPos)

#model = PPO2(MlpPolicy, env, policy_kwargs=p_quarks, learning_rate=my_learning_rate, verbose=1,
#             tensorboard_log="/media/ryuga/Shared Storage/TensorBoardLogs/NEW_DEEP_FRANKA5_RYZEN")  # defaults: learning_rate=2.5e-4,

policy = MlpPolicy  # if MlpLstmPolicy then nminibatches=1 # MlpPolicy
model = PPO2(policy,
             env,
             learning_rate=my_learning_rate,
             verbose=1,
             tensorboard_log=
             "/media/ryuga/Shared Storage/TensorBoardLogs/Rev_NEW_DEEP_FRANKA"
             )  # defaults: learning_rate=2.5e-4,

try:
    f = open("../Envparameters/envparameters_" + name, "x")
    f.write(
        str([
            my_signal_rate, my_signal_repetitions, my_step_limit, lr_start,
            lr_end, timesteps, my_number_of_joints, my_randomBall, my_ballPos
        ]))
    f.close()
except OSError:  # e.g. the file already exists or the path is not writable
    print("envparameters couldn't be saved. They are:" + str([
        my_signal_rate, my_signal_repetitions, my_step_limit, lr_start, lr_end,
Example 24
env = DummyVecEnv([lambda: ProcessorEnv()])
# env = DummyVecEnv([lambda: ProcessorEnv(taskFile='data/dataset/CSV/0.csv')])

# np.random.seed(123)
# env.seed(123)
# print (env.get_attr('reward_range'))
# env.reward_range = env.get_attr('reward_range')
# env = Monitor(env, log_dir, allow_early_resets=True)

# Because we use parameter noise, we should use a MlpPolicy with layer normalization
if (resume):
    model = PPO2.load(models_dir + "ppo2_resetnew_noroundoff_1_expt8")
    model.set_env(env)
    print("RESUMED")
else:
    model = PPO2(MlpPolicy, env, verbose=0, learning_rate=learning_rate)
    print(float(1e-5) == 0.00001)
# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(env=env,
                                            check_freq=1000,
                                            log_dir=log_dir)
# Train the agent

try:
    model.learn(total_timesteps=int(time_steps), callback=callback)
    model.save(models_dir + model_name)
except KeyboardInterrupt:
    model.save(models_dir + model_name + "_abort")
finally:
    mean_episode_reward = env.get_attr('mean_episode_reward')
    print(mean_episode_reward)
Example 25
        super().__init__(sess,
                         ob_space,
                         ac_space,
                         n_env,
                         n_steps,
                         n_batch,
                         n_lstm,
                         reuse,
                         net_arch=[8, 'lstm',
                                   dict(vf=[5, 10], pi=[10])],
                         layer_norm=True,
                         feature_extraction="mlp",
                         **_kwargs)


# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: QtradeEnv()])

model = PPO2(CustomLSTMPolicy, env, verbose=1, nminibatches=1)
model.learn(total_timesteps=50000)
model.save('ppo2_mlplnlstm')

del model
model = PPO2.load('ppo2_mlplnlstm', env=env)

obs = env.reset()
for i in range(20000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
Example 26
def main(args):
    envconfig_string = args.envconfig
    custom_envconfig = _preprocess_custom_envconfig(
        args.envconfig) if args.envconfig is not None else {}
    env_id = 'gym_auv:' + args.env
    env_name = env_id.split(':')[-1] if ':' in env_id else env_id
    envconfig = gym_auv.SCENARIOS[env_name][
        'config'] if env_name in gym_auv.SCENARIOS else {}
    envconfig.update(custom_envconfig)

    NUM_CPU = 8
    EXPERIMENT_ID = str(int(time())) + args.algo.lower()
    model = {
        'ppo': PPO2,
        'ddpg': DDPG,
        'td3': TD3,
        'a2c': A2C,
        'acer': ACER,
        'acktr': ACKTR
    }[args.algo.lower()]

    if args.mode == 'play':
        agent = model.load(args.agent) if args.agent is not None else None
        envconfig_play = envconfig.copy()
        envconfig_play['show_indicators'] = True
        #envconfig_play['autocamera3d'] = False
        env = create_env(env_id,
                         envconfig_play,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot,
                         verbose=True)
        print('Created environment instance')

        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            args.video_dir,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env
                         if args.video_name == 'auto' else args.video_name))
        print(args.video_dir, args.video_name)
        play_scenario(env, recorded_env, args, agent=agent)
        recorded_env.env.close()

    elif (args.mode == 'enjoy'):
        agent = model.load(args.agent)
        # params = agent.get_parameters()
        # policy_weights = [
        #     params['model/pi_fc0/w:0'],
        #     params['model/pi_fc1/w:0'],
        #     params['model/pi/w:0']
        # ]
        # policy_biases = [
        #     params['model/pi_fc0/b:0'],
        #     params['model/pi_fc1/b:0'],
        #     params['model/pi/b:0']
        # ]
        # for param in params:
        #     print(param, params[param].shape)

        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env,
                                    EXPERIMENT_ID)
        os.makedirs(video_folder, exist_ok=True)

        env = create_env(env_id,
                         envconfig,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot)
        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            video_folder,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env
                         if args.video_name == 'auto' else args.video_name))
        obs = recorded_env.reset()
        state = None
        done = [False for _ in range(vec_env.num_envs)]
        for _ in range(args.recording_length):
            if args.recurrent:
                action, _states = agent.predict(
                    observation=obs,
                    state=state,
                    mask=done,
                    deterministic=not args.stochastic)
                state = _states
            else:
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
            obs, reward, done, info = recorded_env.step(action)
            recorded_env.render()
        recorded_env.close()

    elif (args.mode == 'train'):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'figures', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env,
                                    EXPERIMENT_ID)
        recording_length = 8000
        os.makedirs(video_folder, exist_ok=True)
        agent_folder = os.path.join(DIR_PATH, 'logs', 'agents', args.env,
                                    EXPERIMENT_ID)
        os.makedirs(agent_folder, exist_ok=True)
        tensorboard_log = os.path.join(DIR_PATH, 'logs', 'tensorboard',
                                       args.env, EXPERIMENT_ID)
        tensorboard_port = 6006

        if (args.nomp or model == DDPG or model == TD3):
            num_cpu = 1
            vec_env = DummyVecEnv(
                [lambda: create_env(env_id, envconfig, pilot=args.pilot)])
        else:
            num_cpu = NUM_CPU
            vec_env = SubprocVecEnv([
                make_mp_env(env_id, i, envconfig, pilot=args.pilot)
                for i in range(num_cpu)
            ])

        if (args.agent is not None):
            agent = model.load(args.agent)
            agent.set_env(vec_env)
        else:
            if (model == PPO2):
                if args.recurrent:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 1,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-3,
                    }

                    class CustomLSTMPolicy(MlpLstmPolicy):
                        def __init__(self,
                                     sess,
                                     ob_space,
                                     ac_space,
                                     n_env,
                                     n_steps,
                                     n_batch,
                                     n_lstm=256,
                                     reuse=False,
                                     **_kwargs):
                            super().__init__(sess,
                                             ob_space,
                                             ac_space,
                                             n_env,
                                             n_steps,
                                             n_batch,
                                             n_lstm,
                                             reuse,
                                             net_arch=[
                                                 256, 256, 'lstm',
                                                 dict(vf=[64], pi=[64])
                                             ],
                                             **_kwargs)

                    agent = PPO2(CustomLSTMPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams)
                else:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 32,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-4,
                    }
                    #policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[64, 64, 64])
                    #policy_kwargs = dict(net_arch=[64, 64, 64])
                    layers = [256, 128, 64]
                    #layers = [64, 64]
                    policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                    agent = PPO2(MlpPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams,
                                 policy_kwargs=policy_kwargs)
            elif (model == DDPG):
                hyperparams = {
                    'memory_limit':
                    1000000,
                    'normalize_observations':
                    True,
                    'normalize_returns':
                    False,
                    'gamma':
                    0.98,
                    'actor_lr':
                    0.00156,
                    'critic_lr':
                    0.00156,
                    'batch_size':
                    256,
                    'param_noise':
                    AdaptiveParamNoiseSpec(initial_stddev=0.287,
                                           desired_action_stddev=0.287)
                }
                agent = DDPG(LnMlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log,
                             **hyperparams)
            elif (model == TD3):
                action_noise = NormalActionNoise(mean=np.zeros(2),
                                                 sigma=0.1 * np.ones(2))
                agent = TD3(stable_baselines.td3.MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            action_noise=action_noise)
            elif model == A2C:
                hyperparams = {
                    'n_steps': 5,
                    'gamma': 0.995,
                    'ent_coef': 0.00001,
                    'learning_rate': 2e-4,
                }
                layers = [64, 64]
                policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                agent = A2C(MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            **hyperparams,
                            policy_kwargs=policy_kwargs)
            elif model == ACER:
                agent = ACER(MlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log)
            elif model == ACKTR:
                agent = ACKTR(MlpPolicy,
                              vec_env,
                              verbose=True,
                              tensorboard_log=tensorboard_log)

        print('Training {} agent on "{}"'.format(args.algo.upper(), env_id))

        n_updates = 0
        n_episodes = 0

        def callback(_locals, _globals):
            nonlocal n_updates
            nonlocal n_episodes

            sys.stdout.write('Training update: {}\r'.format(n_updates))
            sys.stdout.flush()

            _self = _locals['self']
            vec_env = _self.get_env()

            class Struct(object):
                pass

            report_env = Struct()
            report_env.history = []
            report_env.config = envconfig
            report_env.nsensors = report_env.config[
                "n_sensors_per_sector"] * report_env.config["n_sectors"]
            report_env.sensor_angle = 2 * np.pi / (report_env.nsensors + 1)
            report_env.last_episode = vec_env.get_attr('last_episode')[0]
            report_env.config = vec_env.get_attr('config')[0]
            report_env.obstacles = vec_env.get_attr('obstacles')[0]

            env_histories = vec_env.get_attr('history')
            for episode in range(max(map(len, env_histories))):
                for env_idx in range(len(env_histories)):
                    if (episode < len(env_histories[env_idx])):
                        report_env.history.append(
                            env_histories[env_idx][episode])
            report_env.episode = len(report_env.history) + 1

            total_t_steps = _self.get_env().get_attr(
                'total_t_steps')[0] * num_cpu
            agent_filepath = os.path.join(agent_folder,
                                          str(total_t_steps) + '.pkl')

            if model == PPO2:
                recording_criteria = n_updates % 10 == 0
                report_criteria = True
                _self.save(agent_filepath)
            elif model == A2C or model == ACER or model == ACKTR:
                save_criteria = n_updates % 100 == 0
                recording_criteria = n_updates % 1000 == 0
                report_criteria = True
                if save_criteria:
                    _self.save(agent_filepath)
            elif model == DDPG or model == TD3:
                save_criteria = n_updates % 10000 == 0
                recording_criteria = n_updates % 50000 == 0
                report_criteria = report_env.episode > n_episodes
                if save_criteria:
                    _self.save(agent_filepath)

            if report_env.last_episode is not None and len(
                    report_env.history) > 0 and report_criteria:
                try:
                    #gym_auv.reporting.plot_trajectory(report_env, fig_dir=scenario_folder, fig_prefix=args.env + '_ep_{}'.format(report_env.episode))
                    gym_auv.reporting.report(report_env,
                                             report_dir=figure_folder)
                    #vec_env.env_method('save', os.path.join(scenario_folder, '_ep_{}'.format(report_env.episode)))
                except OSError as e:
                    print("Ignoring reporting OSError:")
                    print(repr(e))

            if recording_criteria:
                if args.pilot:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --pilot {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, args.pilot, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                else:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                subprocess.Popen(cmd)

            n_episodes = report_env.episode
            n_updates += 1

        agent.learn(total_timesteps=1500000,
                    tb_log_name='log',
                    callback=callback)

    elif (args.mode in ['policyplot', 'vectorfieldplot', 'streamlinesplot']):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'plots', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        agent = PPO2.load(args.agent)

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))
            for valuedict in valuegrid:
                customconfig = envconfig.copy()
                customconfig.update(valuedict)
                env = create_env(env_id,
                                 envconfig,
                                 test_mode=True,
                                 pilot=args.pilot)
                valuedict_str = '_'.join(
                    (key + '-' + str(val) for key, val in valuedict.items()))

                print('Running {} test for {}...'.format(
                    args.mode, valuedict_str))

                if args.mode == 'policyplot':
                    gym_auv.reporting.plot_actions(env,
                                                   agent,
                                                   fig_dir=figure_folder,
                                                   fig_prefix=valuedict_str)
                elif args.mode == 'vectorfieldplot':
                    gym_auv.reporting.plot_vector_field(
                        env,
                        agent,
                        fig_dir=figure_folder,
                        fig_prefix=valuedict_str)
                elif args.mode == 'streamlinesplot':
                    gym_auv.reporting.plot_streamlines(
                        env,
                        agent,
                        fig_dir=figure_folder,
                        fig_prefix=valuedict_str)

        else:
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             pilot=args.pilot)
            with open(os.path.join(figure_folder, 'config.json'), 'w') as f:
                json.dump(env.config, f)

            if args.mode == 'policyplot':
                gym_auv.reporting.plot_actions(env,
                                               agent,
                                               fig_dir=figure_folder)
            elif args.mode == 'vectorfieldplot':
                gym_auv.reporting.plot_vector_field(env,
                                                    agent,
                                                    fig_dir=figure_folder)
            elif args.mode == 'streamlinesplot':
                gym_auv.reporting.plot_streamlines(env,
                                                   agent,
                                                   fig_dir=figure_folder)

        print('Output folder: ', figure_folder)

    elif args.mode == 'test':
        figure_folder = os.path.join(DIR_PATH, 'logs', 'tests', args.env,
                                     EXPERIMENT_ID)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        video_folder = os.path.join(figure_folder, 'videos')
        os.makedirs(figure_folder, exist_ok=True)
        os.makedirs(scenario_folder, exist_ok=True)
        os.makedirs(video_folder, exist_ok=True)

        if not args.onlyplot:
            agent = model.load(args.agent)

        def create_test_env(video_name_prefix=args.env, envconfig=envconfig):
            print('Creating test environment: ' + env_id)
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             render_mode=args.render if args.video else None,
                             pilot=args.pilot)
            vec_env = DummyVecEnv([lambda: env])
            if args.video:
                video_length = min(500, args.recording_length)
                recorded_env = VecVideoRecorder(vec_env,
                                                video_folder,
                                                record_video_trigger=lambda x:
                                                (x % video_length) == 0,
                                                video_length=video_length,
                                                name_prefix=video_name_prefix)
            active_env = recorded_env if args.video else vec_env

            return env, active_env

        failed_tests = []

        def run_test(id,
                     reset=True,
                     report_dir=figure_folder,
                     scenario=None,
                     max_t_steps=None,
                     env=None,
                     active_env=None):
            nonlocal failed_tests

            if env is None or active_env is None:
                env, active_env = create_test_env(video_name_prefix=args.env +
                                                  '_' + id)

            if scenario is not None:
                obs = active_env.reset()
                env.load(scenario)
                print('Loaded', scenario)
            else:
                if reset:
                    obs = active_env.reset()
                else:
                    obs = env.observe()

            gym_auv.reporting.plot_scenario(env,
                                            fig_dir=scenario_folder,
                                            fig_postfix=id,
                                            show=args.onlyplot)
            if args.onlyplot:
                return
            cumulative_reward = 0
            t_steps = 0
            done = False

            while not done:
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
                obs, reward, done, info = active_env.step(action)
                if args.video:
                    active_env.render()
                t_steps += 1
                cumulative_reward += reward[0]
                # stop the episode early if a step budget was given
                if max_t_steps is not None and t_steps >= max_t_steps:
                    done = True
                report_msg = '{:<20}{:<20}{:<20.2f}{:<20.2%}\r'.format(
                    id, t_steps, cumulative_reward, info[0]['progress'])
                sys.stdout.write(report_msg)
                sys.stdout.flush()

                if args.save_snapshots and t_steps % 30 == 0 and not done:
                    env.save_latest_episode(save_history=False)
                    for size in (20, 50, 100, 200, 300, 400, 500):
                        gym_auv.reporting.plot_trajectory(
                            env,
                            fig_dir=scenario_folder,
                            fig_prefix=(args.env + '_t_step_' + str(t_steps) +
                                        '_' + str(size) + '_' + id),
                            local=True,
                            size=size)
                elif done:
                    gym_auv.reporting.plot_trajectory(env,
                                                      fig_dir=scenario_folder,
                                                      fig_prefix=(args.env +
                                                                  '_' + id))

            env.close()

            gym_auv.reporting.report(env, report_dir=report_dir, lastn=-1)
            #gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id))
            #env.save(os.path.join(scenario_folder, id))
            if env.collision:
                failed_tests.append(id)
                with open(os.path.join(figure_folder, 'failures.txt'),
                          'w') as f:
                    f.write(', '.join(map(str, failed_tests)))

            return copy.deepcopy(env.last_episode)

        print('Testing scenario "{}" for {} episodes.\n '.format(
            args.env, args.episodes))
        report_msg_header = '{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}'.format(
            'Episode', 'Timesteps', 'Cum. Reward', 'Progress', 'Collisions',
            'CT-Error [m]', 'H-Error [deg]')
        print(report_msg_header)
        print('-' * len(report_msg_header))

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))

        if args.scenario:
            if args.testvals:
                episode_dict = {}
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val)
                         for key, val in valuedict.items()))

                    colorval = -np.log10(
                        valuedict['reward_lambda'])  #should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' +
                                                str(episode),
                                                report_dir=rep_subfolder)
                        episode_dict[valuedict_str] = [last_episode, colorval]
                print('Plotting all')
                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=scenario_folder,
                                                  fig_prefix=(args.env +
                                                              '_all_agents'),
                                                  episode_dict=episode_dict)

            else:
                run_test("ep0", reset=True, scenario=args.scenario)

        else:
            if args.testvals:
                episode_dict = {}
                agent_index = 1
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val)
                         for key, val in valuedict.items()))

                    colorval = np.log10(
                        valuedict['reward_lambda'])  #should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' +
                                                str(episode),
                                                report_dir=rep_subfolder)
                    episode_dict['Agent ' +
                                 str(agent_index)] = [last_episode, colorval]
                    agent_index += 1

                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=figure_folder,
                                                  fig_prefix=(args.env +
                                                              '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                env, active_env = create_test_env(video_name_prefix=args.env)
                for episode in range(args.episodes):
                    run_test('ep' + str(episode),
                             env=env,
                             active_env=active_env)

        if args.video and active_env:
            active_env.close()
Esempio n. 27
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Example on how to use the 'Pendulum' OpenAI Gym environments in PRL using the `stable_baselines` library.
"""

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from pyrobolearn.envs import gym  # this is a thin wrapper around the gym library

# create env, state, and action from gym
env = gym.make('Pendulum-v0')
state, action = env.state, env.action
print("State and action space: {} and {}".format(state.space, action.space))

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: env])

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
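
# The script above trains and runs the policy but never persists it; with the standard
# stable_baselines API the model can be saved and reloaded later. A minimal sketch
# (the file name "ppo2_pendulum" is arbitrary):
model.save("ppo2_pendulum")
del model
model = PPO2.load("ppo2_pendulum")

obs = env.reset()
action, _states = model.predict(obs, deterministic=True)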
Esempio n. 28
0
def main():
    # create the log directory
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)
    # create the environments
    envRoll = gym.make('gym_foo:DroneRoll-v0')
    envRoll = Monitor(envRoll, log_dir)
    modelRoll = PPO2(MlpPolicy,
                     envRoll,
                     gamma=0.99,
                     n_steps=2048,
                     ent_coef=0.0,
                     learning_rate=3e-4,
                     lam=0.95,
                     nminibatches=32,
                     noptepochs=10,
                     cliprange=0.2,
                     verbose=1)
    envPitch = gym.make('gym_foo:DronePitch-v0')
    envPitch = Monitor(envPitch, log_dir)

    modelPitch = PPO2(MlpPolicy,
                      envPitch,
                      gamma=0.99,
                      n_steps=2048,
                      ent_coef=0.0,
                      learning_rate=3e-4,
                      lam=0.95,
                      nminibatches=32,
                      noptepochs=10,
                      cliprange=0.2,
                      verbose=1)
    envYaw = gym.make('gym_foo:DroneYaw-v0')
    envYaw = Monitor(envYaw, log_dir)

    modelYaw = PPO2(MlpPolicy,
                    envYaw,
                    gamma=0.99,
                    n_steps=2048,
                    ent_coef=0.0,
                    learning_rate=3e-4,
                    lam=0.95,
                    nminibatches=32,
                    noptepochs=10,
                    cliprange=0.2,
                    verbose=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    # train the models
    time_steps = 2e6
    modelRoll.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Roll")
    plt.show()

    modelPitch.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Pitch")
    plt.show()

    modelYaw.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Yaw")
    plt.show()

    # save the models
    modelRoll.save("Drone_Roll_PPO_001")
    modelPitch.save("Drone_Pitch_PPO_001")
    modelYaw.save("Drone_Yaw_PPO_001")

    # load a saved model
    #model = PPO2.load("Drone_Roll_PPO_001")

    # test: generate the time response
    T = [0]
    # test loop
    t = 0
    #obs = env.reset()
    obsRoll = envRoll.reset()
    obsPitch = envPitch.reset()
    obsYaw = envYaw.reset()
    # initial angles, converted to degrees like the values appended below
    Roll = [(180 / np.pi) * envRoll.state[0]]
    Pitch = [(180 / np.pi) * envPitch.state[0]]
    Yaw = [(180 / np.pi) * envYaw.state[0]]

    # simulation loop
    while t < 10:  # up to 10 seconds

        actionRoll, _states = modelRoll.predict(obsRoll)
        # Retrieve new state, reward, and whether the state is terminal
        obsRoll, reward, done, info = envRoll.step(actionRoll)
        Roll.append((180 / np.pi) * envRoll.state[0])

        actionPitch, _states = modelPitch.predict(obsPitch)
        # Retrieve new state, reward, and whether the state is terminal
        obsPitch, reward, done, info = envPitch.step(actionPitch)
        Pitch.append((180 / np.pi) * envPitch.state[0])

        actionYaw, _states = modelYaw.predict(obsYaw)
        # Retrieve new state, reward, and whether the state is terminal
        obsYaw, reward, done, info = envYaw.step(actionYaw)
        Yaw.append((180 / np.pi) * envYaw.state[0])

        t += 0.01
        T.append(t)

    #Plots
    plt.figure(1)
    plt.plot(T, Roll)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Roll (degrees)')
    plt.xlabel('Time (seconds)')
    plt.title('Roll Response')
    plt.grid()
    plt.show()

    plt.figure(2)
    plt.plot(T, Pitch)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Pitch (degrees)')
    plt.xlabel('Time (seconds)')
    plt.title('Pitch Response')
    plt.grid()
    plt.show()

    plt.figure(3)
    plt.plot(T, Yaw)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Yaw (degrees)')
    plt.xlabel('Time (seconds)')
    plt.title('Yaw Response')
    plt.grid()
    plt.show()
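
    # Note on the setup above: all three environments write their Monitor logs to the
    # same directory ("tmp/") and share a single callback instance, so their episode
    # rewards end up in one monitor file and the "best model" checkpoints overwrite
    # each other. A sketch of one way to keep them apart, assuming the callback is the
    # SaveOnBestTrainingRewardCallback from the stable_baselines docs (which reads
    # Monitor logs from its log_dir):
    #
    #   log_dir_roll = os.path.join("tmp", "roll")
    #   os.makedirs(log_dir_roll, exist_ok=True)
    #   envRoll = Monitor(gym.make('gym_foo:DroneRoll-v0'), log_dir_roll)
    #   callbackRoll = SaveOnBestTrainingRewardCallback(check_freq=1000,
    #                                                   log_dir=log_dir_roll)
    #   modelRoll.learn(total_timesteps=int(time_steps), callback=callbackRoll)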
Esempio n. 29
0
INITIAL_ACCOUNT_BALANCE = 50000

model_name = 'dqn'

# dataset loading
df_train = pd.read_csv('./data/SPY_training.csv')
df_train = df_train.sort_values('Date')
df_test = pd.read_csv('./data/SPY_test.csv')
df_test = df_test.sort_values('Date')

# training
env = DummyVecEnv([lambda: StockTradingEnv(df_train)])
model = PPO2(MlpPolicy,
             env,
             verbose=1,
             seed=42,
             n_cpu_tf_sess=1,
             tensorboard_log="./tensorboard/")
# kwargs = {'double_q': False, 'prioritized_replay': False, 'policy_kwargs': dict(dueling=False)}
# model = DQN(MlpPolicy, env, verbose=1, seed=42, n_cpu_tf_sess=1, tensorboard_log="./tensorboard/", **kwargs)
model.learn(total_timesteps=40000, log_interval=10)
# model.save(save_dir + model_name)

# del model
# model = DQN.load(save_dir + model_name)

# # testing (previous 5 days)
# env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
# obs = env.reset()
# daily_profit = []
# buy_hold_profit = []
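
# A minimal evaluation sketch on the held-out data, assuming StockTradingEnv exposes the
# same Gym interface used for training; the cumulative reward is reported here instead
# of the daily-profit bookkeeping hinted at above, which would need env internals.
test_env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
obs = test_env.reset()
total_reward = 0.0
done = False
while not done:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = test_env.step(action)
    total_reward += reward[0]
print('Cumulative test reward: {:.2f}'.format(total_reward))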
Esempio n. 30
0
import os
import matplotlib.pyplot as plt

import gym
from gym_minigrid.wrappers import FlatObsWrapper
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

tensorboard_folder = '/root/code/stable_baselines/tensorboard/MiniGrid-Empty-16x16/'
model_folder = './models/MiniGrid-Empty-16x16/'
os.makedirs(tensorboard_folder, exist_ok=True)
os.makedirs(model_folder, exist_ok=True)

env = gym.make('MiniGrid-Empty-16x16-v0')
env = FlatObsWrapper(env)
model = PPO2('MlpPolicy',
             env,
             verbose=0,
             nminibatches=1,
             n_steps=128,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=1000000, tb_log_name='PPO2')
model.save(model_folder + "PPO2")
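
# A minimal sketch of reloading the saved agent and rolling out one greedy episode;
# the deterministic flag and the single-episode evaluation are choices made here, not
# part of the original script.
del model
model = PPO2.load(model_folder + "PPO2")

obs = env.reset()
done = False
episode_reward = 0.0
while not done:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    episode_reward += reward
print('Episode reward after reloading: {}'.format(episode_reward))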