Example #1
def run_experiment(verbose, tensorboard_log, learning_rate):
    # pdb.set_trace()
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
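    # Gaussian exploration noise for the actions (note: `stddev` above is unused; sigma is hard-coded to 0.1)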
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" %
               (name, np.format_float_scientific(nIter),
                np.format_float_scientific(learning_rate)))
    env.close()
Example #2
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy", inttype=retro.data.Integrations.ALL)
    print(env)

    print(env.action_space)
    time.sleep(3)

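    # Wrap the single retro env in a vectorized env (a DummyVecEnv) as stable-baselines expects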
    env = make_vec_env(lambda: env, n_envs=1)
    # check_env(env, warn=True)
    time.sleep(3)

    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

    env.close()
Example #3
    def run_train(self):
        env = CustomEnv(self.path_planner, self.behavior_planner, self.event)
        env = make_vec_env(lambda: env, n_envs=1)
        model = None
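        # Pick the DQN policy and hyperparameters for the current scenario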
        if self.event == Scenario.LANE_CHANGE:
            model = DQN(CustomLaneChangePolicy,
                        env,
                        verbose=1,
                        learning_starts=256,
                        batch_size=256,
                        exploration_fraction=0.9,
                        target_network_update_freq=100,
                        tensorboard_log=dir_path + '/Logs/')

        if self.event == Scenario.PEDESTRIAN:
            model = DQN(CustomPedestrianPolicy,
                        env,
                        verbose=1,
                        learning_starts=256,
                        batch_size=256,
                        exploration_fraction=0.9,
                        target_network_update_freq=100,
                        tensorboard_log=dir_path + '/Logs/Ped',
                        gamma=0.93,
                        learning_rate=0.0001)
        model.learn(total_timesteps=20000)
        model.save(MODEL_SAVE_PATH)
Example #4
    def run_test(self):
        env = CustomEnv(self.path_planner, self.behavior_planner, self.event)
        env = make_vec_env(lambda: env, n_envs=1)
        if (self.event == Scenario.LANE_CHANGE):
            model = DQN.load(MODEL_LOAD_PATH)
        if (self.event == Scenario.PEDESTRIAN):
            model = DQN.load(MODEL_LOAD_PATH)
        obs = env.reset()
        count = 0
        success = 0
        while count < 500:
            done = False

            while not done:
                action, _ = model.predict(obs)

                print("Action taken:", RLDecision(action))
                obs, reward, done, info = env.step(action)
                # print("Reward",reward)
            count += 1
            if info[0]["success"]:
                success += 1
            print("Count ", count, "Success ", success, "Success Rate:",
                  success * 100 / float(count), "%")
        print("Success Rate ", success / count, success, count)
Example #5
    def model_free_policy(self, ne, n_epochs=1, train=True, load_model=False):
        if self.autoencoder is None:
            self.setup_autoencoder(ne.get_obs())
            assert self.autoencoder is not None
        if ne.autoencoder is None:
            ne.set_autoencoder(self.autoencoder)
            ne.autoencoder = self.autoencoder
        if train:
            fn = "models/model1.h5"
            self.mf_policy = PPO2(env=ne,
                                  policy=MlpPolicy,
                                  n_steps=40,
                                  verbose=2,
                                  noptepochs=10,
                                  learning_rate=3e-4,
                                  ent_coef=0.1,
                                  gamma=0.1)
            if load_model:
                # PPO2.load is a classmethod that returns a new model instance
                self.mf_policy = PPO2.load(fn, env=make_vec_env(lambda: ne))
            else:
                self.mf_policy.learn(total_timesteps=n_epochs * 40)
                self.mf_policy.save(fn)
        encoded_obs = ne.rl_obs()
        return self.mf_policy.step([encoded_obs],
                                   deterministic=True)[0].flatten()
Example #6
def _load(model_name):
    model = PPO2.load(model_name)
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos_3"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model.set_env(env)
    model.learn(total_timesteps=int(nIter), log_interval=100)
    # model.save(exp_name)
    model.save(model_name + "_new")
    env.close()
Example #7
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8

    save_video_length = 200
    save_video_interval = 1000000
    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2

    pool = multiprocessing.Pool(processes=4)
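    # NOTE: no work is ever submitted to this pool; the apply_async call below is commented out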
    for lr in [1e-5]:  #, 5e-4, 1e-5
        logger = osp.join(
            expDir, name, 'logs%s_%s' % (np.format_float_scientific(nIter),
                                         np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env,
            osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()
    file.close()
    pool.close()
    pool.join()
Example #8
def main(output_folder_path: Path):
    env = make_vec_env('roar-occu-map-e2e-v0')
    env.reset()
    model_params: dict = {
        "verbose": 1,
        "env": env,
        "n_steps": 100
        # "render": True,
    }
    model, callbacks = setup(model_params, output_folder_path)
    model = model.learn(total_timesteps=1e6,
                        callback=callbacks,
                        reset_num_timesteps=False)
Example #9
def test_custom_vec_env():
    """
    Stand alone test for a special case (passing a custom VecEnv class) to avoid doubling the number of tests.
    """
    monitor_dir = 'logs/test_make_vec_env/'
    env = make_vec_env('CartPole-v1',
                       n_envs=1,
                       monitor_dir=monitor_dir,
                       seed=0,
                       vec_env_cls=SubprocVecEnv,
                       vec_env_kwargs={'start_method': None})

    assert env.num_envs == 1
    assert isinstance(env, SubprocVecEnv)
    assert os.path.isdir('logs/test_make_vec_env/')
    # Kill subprocess
    env.close()
    # Cleanup folder
    shutil.rmtree(monitor_dir)

    # This should fail because DummyVecEnv does not accept any keyword arguments
    with pytest.raises(TypeError):
        make_vec_env('CartPole-v1', n_envs=1, vec_env_kwargs={'dummy': False})
Example #10
def record(exp):
    model = SAC.load(exp)
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos_2"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model.set_env(env)
    model.learn(total_timesteps=2000, log_interval=100)
    # model.save(expDir + "/%s/%d" %(name, nIter))
    env.close()
Example #11
def mk_env_agent(env_class, registered_model, params, gui=False):
    model = SAC.load(registered_model.source)

    params_fname = f'{registered_model.source}.json'  # FIXME
    with open(params_fname, 'r') as fp:
        loaded_params = json.load(fp)

    params = {**loaded_params, **params}  # merge, overriding loaded params
    env = make_vec_env(lambda: env_class(params['NJ'], params, gui=gui),
                       n_envs=1)

    model.set_env(env)
    env.env_method('set_render_info', {
        'name': registered_model.name,
        'version': registered_model.version
    })  # FIXME

    return env, model
Example #12
def train_stable_baselines(args):
	"""
		Trains with PPO2 on specified environment.

		Parameters:
			args - the arguments defined in main.

		Return:
			None
	"""
	# Import stable baselines
	from stable_baselines import PPO2
	from stable_baselines.common.callbacks import CheckpointCallback
	from stable_baselines.common.cmd_util import make_vec_env
	from stable_baselines.common.evaluation import evaluate_policy

	# Store hyperparameters and total timesteps to run by environment
	hyperparameters = {}
	total_timesteps = 0
	if args.env == 'Pendulum-v0':
		hyperparameters = {'n_steps': 2048, 'nminibatches': 32, 'lam': 0.95, 'gamma': 0.99, 'noptepochs': 10,
							'ent_coef': 0.0, 'learning_rate': 3e-4, 'cliprange': 0.2, 'verbose': 1, 'seed': args.seed}
		total_timesteps = 1005000
	elif args.env == 'BipedalWalker-v3':
		hyperparameters = {'n_steps': 2048, 'nminibatches': 32, 'lam': 0.95, 'gamma': 0.99, 'noptepochs': 10,
							'ent_coef': 0.001, 'learning_rate': 2.5e-4, 'cliprange': 0.2, 'verbose': 1, 'seed': args.seed}
		total_timesteps = 1405000
	elif args.env == 'LunarLanderContinuous-v2':
		hyperparameters = {'n_steps': 1024, 'nminibatches': 32, 'lam': 0.98, 'gamma': 0.999, 'noptepochs': 4,
							'ent_coef': 0.01, 'cliprange': 0.2, 'verbose': 1, 'seed': args.seed}
		total_timesteps = 1005000
	elif args.env == 'MountainCarContinuous-v0':
		hyperparameters = {'n_steps': 256, 'nminibatches': 8, 'lam': 0.94, 'gamma': 0.99, 'noptepochs': 4,
							'ent_coef': 0.0, 'cliprange': 0.2, 'verbose': 1, 'seed': args.seed}
		total_timesteps = 405000

	# Create log dir
	log_dir = "/tmp/gym/"
	os.makedirs(log_dir, exist_ok=True)

	# Make the environment and model, and train
	env = make_vec_env(args.env, n_envs=1, monitor_dir=log_dir)
	model = PPO2('MlpPolicy', env, **hyperparameters)
	model.learn(total_timesteps)
Example #13
def sb_model_train(rl_manager):
    env = CustomEnv(rl_manager)
    env = make_vec_env(lambda: env, n_envs=1)
    model = DQN(CustomPolicy,
                env,
                verbose=1,
                learning_starts=256,
                batch_size=256,
                exploration_fraction=0.5,
                target_network_update_freq=10,
                tensorboard_log='./Logs/')
    # model = DQN(MlpPolicy, env, verbose=1, learning_starts=64,  target_network_update_freq=50, tensorboard_log='./Logs/')
    # model = DQN.load("DQN_Model_SimpleSim_30k",env=env,exploration_fraction=0.1,tensorboard_log='./Logs/')
    model.learn(total_timesteps=10000)
    # model = PPO2(MlpPolicy, env, verbose=1,tensorboard_log="./Logs/")
    # model.learn(total_timesteps=20000)
    model.save(dir_path + "/DQN_Model_SimpleSim")
    # sb_model_test(rl_manager)
    return
Example #14
def learn():
	# expDir = '/home/shivanik/lab/pointExp/state/'
	# verbose = 1
	# num_objs = 1
	# name = 'ppo2_%d' %num_objs
	# logger = osp.join(expDir, name, 'logs')
	# video_folder = osp.join(logger, 'videos')
	# nIter = 1e7
	# save_video_interval = 5000

	env = make_vec_env('PointMassDense-%d-v1' %num_objs, 1,  wrapper_class = FlattenDictWrapper, wrapper_env_kwargs =['observation', 'achieved_goal', 'desired_goal'])
	env = VecVideoRecorder(env, video_folder,
                       record_video_trigger=lambda x: x % save_video_interval == 0, video_length=save_video_length,
                       name_prefix="Video-{}")

	model = PPO2(MlpPolicy, env, verbose=verbose,
	            tensorboard_log=logger,)
	model.learn(total_timesteps=int(nIter))
	model.save(expDir + "/%s/%s" %(name, np.format_float_scientific(nIter)))
Example #15
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy",
                     inttype=retro.data.Integrations.ALL,
                     obs_type=retro.Observations.RAM,
                     use_restricted_actions=retro.Actions.DISCRETE
                     )  #, use_restricted_actions=retro.Actions.DISCRETE
    print(env)

    # print(env.action_space)

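    # NOTE: the lambda returns the same retro env instance every time, so the 4
    # "parallel" workers all wrap one shared emulator rather than independent copies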
    vec_env = make_vec_env(lambda: env, n_envs=4)
    # time.sleep(3)

    model = A2C(MlpPolicy, vec_env, verbose=1)

    start_time = time.time()
    model.learn(total_timesteps=200000)
    print("TRAINING COMPLETE! Time elapsed: ", str(time.time() - start_time))

    print("Attempting to get first pokemon!")
    start_time = time.time()
    printed_done = False
    # sampled_info = False

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        # if not sampled_info:
        #     print("Info:\n", info, "\n</info>")
        #     sampled_info = True

        if dones and not printed_done:
            print("Success! time elapsed: ", str(time.time() - start_time))
            printed_done = True

    env.close()
Example #16
    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs)
                ])
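            # NOTE: the two statements below overwrite the vectorized env that was
            # just built above with a plain DummyVecEnv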
            env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
            if env_wrapper is not None:
                env = env_wrapper(env)
        elif ((algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1):
            raise ValueError(
                "Error: {} does not support multiprocessing!".format(algo))
        elif ((algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1):
            raise ValueError(
                "Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id,
                               n_envs=n_envs,
                               seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)

        if args.normalize:  # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env
Example #17
def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class):
    env = make_vec_env(env_id,
                       n_envs,
                       vec_env_cls=vec_env_cls,
                       wrapper_class=wrapper_class,
                       monitor_dir=None,
                       seed=0)

    assert env.num_envs == n_envs

    if vec_env_cls is None:
        assert isinstance(env, DummyVecEnv)
        if wrapper_class is not None:
            assert isinstance(env.envs[0], wrapper_class)
        else:
            assert isinstance(env.envs[0], Monitor)
    else:
        assert isinstance(env, SubprocVecEnv)
    # Kill subprocesses
    env.close()
Example #18
def sb_model_test(rl_manager):
    env = CustomEnv(rl_manager)
    env = make_vec_env(lambda: env, n_envs=1)
    model = DQN.load(dir_path + "/DQN_Model_SimpleSim_30k")
    obs = env.reset()
    count = 0
    success = 0
    while count < 100:
        done = False
        print("Count ", count, "Success ", success)
        while not done:
            action, _ = model.predict(obs)

            print(action)
            obs, reward, done, info = env.step(action)
        count += 1
        if reward == 5:
            success += 1
    print("Success Rate ", success / count, success, count)
    rl_manager.finish = True
Example #19
def recieve(sid, data):

    global done
    global reward
    global maxactions
    jsonInput = json.loads(data)
    maxactions = jsonInput['maxactions']
    trainepisodes = jsonInput['trainepisodes']
    evalepisodes = jsonInput['evalepisodes']
    totalepisodes = trainepisodes + evalepisodes

    env = UnrealEnvWrap()
    # wrap it
    env = make_vec_env(lambda: env, n_envs=1)

    # Train the agent with different algorithms from stable baselines

    #model = DQN(MlpPolicy, env, verbose=1, tensorboard_log="./DQN_newobservations/")
    model = DQN(MlpPolicy, env, verbose=1)
    #model = A2C(MlpPolicy, env, verbose=1, tensorboard_log="./A2C_newobservations/")
    #model = A2C(MlpPolicy, env, verbose=1)
    print("Agent training in process...")
    model.learn(total_timesteps=trainepisodes)

    # Test the trained agent (currently not needed; all testing occurs in Unreal itself)
    env.render(mode='console')
    #env.render()

    obs = env.reset()
    print("Training complete, Starting Evaluation of Trained Model:")
    intaction = 0
    # Begin strategic behavior
    for step in range(evalepisodes):
        action, _ = model.predict(obs, deterministic=True)
        intaction = action[0]
        print("Action: ", intaction)
        obs, reward, done, info = env.step(action)
        print('obs=', obs, 'reward=', reward, 'done=', done)

    sio.disconnect(sid)
Example #20
def train(agent=None):
    weights = {'fr': 0.3, 'fl': 20, 'fk': 20}
    depth, width, move_dist, plan_dist = 3, 3, 3, 3
    mkenv = lambda: Env(depth, width, move_dist, plan_dist,
                        max_steps=20, weights=weights,
                        obstacle_pct=0.1)

    eval_callback = EvalCallback(mkenv(),
                             best_model_save_path='logs/models',
                             log_path='logs', eval_freq=1_000,
                             deterministic=True, render=False)

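    # 32 parallel training envs, each wrapped in a Monitor that logs to logs/training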
    vecenv = make_vec_env(mkenv, 32, monitor_dir='logs/training')
    if agent:
        agent.set_env(vecenv)
    else:
        hparams = dict(n_steps=64, nminibatches=64, gamma=0.90,
                       learning_rate=2e-5, ent_coef=0.01,
                       cliprange=0.4, noptepochs=25, lam=0.99)
        agent = PPO2('MlpPolicy', vecenv, verbose=True, **hparams)
    agent.learn(1_000_000, callback=eval_callback)
    agent.save('logs/models/final')
    vecenv.close()
    return agent
Example #21
#! /usr/bin/env python

import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import env_yaw
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = gym.make("Yaw-v0")
env = make_vec_env(lambda: env, n_envs=1)

# model = ACKTR.load("models/acktr_goleft", env=env)
model = ACKTR('MlpPolicy', env, verbose=1)

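# Evaluate the freshly created (untrained) policy over 100 episodes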
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
Example #22
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
#import tensorflow as tf

from stable_baselines import PPO2, logger
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.common.policies import MlpPolicy, CnnLstmPolicy, CnnLnLstmPolicy
import gym
#
from stable_baselines.ppo2.cppo2 import CPPO2

import numpy as np
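# A single CartPole env; the positional arguments to make_vec_env are n_envs=1 and seed=17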
env = make_vec_env('CartPole-v1', 1, 17)
model_random = PPO2(MlpPolicy,
                    env,
                    verbose=1,
                    cliprange=0.1,
                    seed=17,
                    n_cpu_tf_sess=1)  #, previous_model="previous.zip")
model_random.save("random")
#model = CPPO2(MlpPolicy, env, verbose=1, previous_model="random.zip", cliprange=0.01)
#model = model_random
#model = PPO2.load("random.zip", env=env)
model = CPPO2.load("random.zip", previous_model_path="random.zip", env=env)
model.learn(total_timesteps=4000)
# model.save("continuous_from_random")

sum_rewards = 0
episode_rewards = []
obs = env.reset()
Example #23
    def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
        self.agent_name = agent_name

        #self.env = LearningRocket(visualize=False)
        #self.env = NormalizeActionWrapper(self.env)

        #self.eval_env = LearningRocket(visualize=True)
        #self.eval_env = NormalizeActionWrapper(self.eval_env)

        #self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
        self.env = make_vec_env(
            LearningRocket, n_envs=16
        )  #[lambda: LearningRocket(visualize=False) for i in range(16)]))
        #self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
        self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True),
                                     n_envs=1)
        #self.eval_env = VecNormalize(self.eval_env)
        self.eval_callback = EvalCallback(self.eval_env,
                                          best_model_save_path='Agent007',
                                          log_path='./logs/',
                                          eval_freq=10000,
                                          deterministic=True,
                                          render=False,
                                          n_eval_episodes=1)
        kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
        #check_env(self.env, warn=True)
        """
        if algorithm == "SAC":
            if load is True:
                self.model = SAC.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/")
                #self.model.ent_coef=0.2
            else:
                self.model = SAC('MlpPolicy', self.env, verbose=1, tensorboard_log="./rocket_tensorboard/",ent_coef=5)
            print("Trainer Set for SAC")
        """
        if algorithm == "TD3":
            n_actions = self.env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            if load is True:
                self.model = TD3.load(agent_name,
                                      env=self.env,
                                      tensorboard_log="./rocket_tensorboard/")
                #file = open('replay_buffer', 'rb')
                #self.model.replay_buffer = pickle.load(file)
                #file.close()
            else:
                self.model = TD3(MlpPolicy,
                                 self.env,
                                 action_noise=action_noise,
                                 batch_size=768,
                                 gamma=0.95,
                                 learning_rate=1e-4,
                                 learning_starts=20000,
                                 verbose=1,
                                 tensorboard_log="./rocket_tensorboard/",
                                 policy_kwargs=dict(layers=[400, 300]))
            print("Trainer Set for TD3")
        elif algorithm == "PPO2":
            if load is True:
                self.model = PPO2.load(agent_name,
                                       env=self.env,
                                       tensorboard_log="./rocket_tensorboard/")
                self.eval_env = VecNormalize.load(self.agent_name + "vEnv",
                                                  self.eval_env)
                #self.eval_env.clip_obs = 500
                #self.env = VecNormalize(self.env)
                self.env = VecNormalize.load(self.agent_name + "vEnv",
                                             self.env)
                #self.env.clip_obs = 500
                #self.env.norm_obs = False
                #self.eval_env.norm_obs = False
            else:
                self.model = PPO2(PPOMlpPolicy,
                                  self.env,
                                  n_steps=1024,
                                  nminibatches=32,
                                  lam=0.98,
                                  gamma=0.999,
                                  noptepochs=4,
                                  ent_coef=0.01,
                                  verbose=1,
                                  tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs=dict(layers=[400, 300]))
                self.eval_env = VecNormalize(self.eval_env)
                self.env = VecNormalize(self.env)
                #self.eval_env.clip_obs = 500
                #self.env.clip_obs = 500
                #self.env.norm_obs=False
                #self.eval_env.norm_obs=False

                print("Trainer set for PPO2. I am speed.")
Example #24
import math

import gym
from gym.spaces import MultiDiscrete
from stable_baselines import PPO2, A2C
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.callbacks import BaseCallback, CheckpointCallback, EveryNTimesteps
from curiosity_mask.util import create_dummy_action_mask as mask
from curiosity_mask.util import set_action_mask_gait as gait_mask
from curiosity_mask.ui import UI

from transitions import Machine

#env = gym.make('Acrobot-v1')
env = make_vec_env('Acrobot-v1', n_envs=40)


class Balance(object):
    def __init__(self):
        self.action_mask = []
        self.num_timesteps = None

    def is_sufficient_torque(self, event):
        #return False # abs(event.kwargs.get('torque')) > 9.8
        #if math.cos(event.kwargs.get('angle_1')) < -0.7:  # Torso link is lifted high
        #    sys.exit()
        return math.cos(event.kwargs.get('angle_1')) < -0.7

    def back_to_swing(self, event):
        return False
Example #25
import os

import gym

# for plotting results:
from stable_baselines import PPO2, results_plotter
from stable_baselines.bench import Monitor
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.results_plotter import load_results

# creating environment
env = gym.make("Witches-v0")

log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = Monitor(env, log_dir)

# wrap it
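# NOTE: the lambda returns the same Monitor-wrapped env object for all 60 copies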
env_vec = make_vec_env(lambda: env, n_envs=60)

# If the environment doesn't follow the interface, an error will be thrown
#check_env(env, warn=True)

model = PPO2('MlpLstmPolicy', env_vec, verbose=1)

time_steps = 1e8
model.learn(int(time_steps))

# export model as onnx:
#1. export params

#2. load params in pytorch:
model.save("tmp/witches")
Example #26
    def __init__(self):
        rospy.init_node('train_node', anonymous=True)

        env = gym.make("Yaw-v0")
        env = make_vec_env(lambda: env, n_envs=1)
        model = DQN('MlpPolicy', env, verbose=1).learn(1000)
Example #27
    ALGO = PPO2

    # We will create one environment to evaluate the agent on
    eval_env = gym.make(env_id)

    # DummyVecEnv vs SubprocVecEnv
    reward_averages = []
    reward_std = []
    training_times = []
    total_procs = 0
    for n_procs in PROCESSES_TO_TEST:
        total_procs += n_procs
        print('Running for n_procs = {}'.format(n_procs))
        # Here we are using only one process even for n_env > 1
        # this is equivalent to DummyVecEnv([make_env(env_id, i + total_procs) for i in range(n_procs)])
        train_env = make_vec_env(env_id, n_envs=n_procs)

        rewards = []
        times = []

        for experiment in range(NUM_EXPERIMENTS):
            # it is recommended to run several experiments due to variability in results
            train_env.reset()
            model = ALGO('MlpPolicy', train_env, verbose=0)
            start = time.time()
            model.learn(total_timesteps=TRAIN_STEPS)
            times.append(time.time() - start)
            mean_reward, _ = evaluate_policy(model,
                                             eval_env,
                                             n_eval_episodes=EVAL_EPS)
            rewards.append(mean_reward)
Example #28
import time

import gym
import numpy as np

from stable_baselines import ACKTR
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines.common import set_global_seeds
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.cmd_util import make_vec_env

# By default, we use a DummyVecEnv as it is usually faster (cf doc)

env_id = "CartPole-v1"
num_cpu = 4  # Number of processes to use
vec_env = make_vec_env(env_id, n_envs=num_cpu)

model = ACKTR('MlpPolicy', vec_env, verbose=0)
Example #29
    )
    env.reset()

    model = PPO2.load('../data/pretrained_models/controlTableLine/PPO')

    for _ in range(video_length + 1):
        action, _ = model.predict(obs, deterministic=True)
        obs, _, _, _ = env.step(action)
    env.close()

elif mode == 'gif':
    import imageio
    from stable_baselines.common.cmd_util import make_vec_env

    images = []
    env = make_vec_env(controlTableLine, n_envs=1)
    model = PPO2.load('../data/pretrained_models/controlTableLine/PPO', env)
    obs = model.env.reset()
    img = model.env.render(mode='rgb_array')
    for i in range(1200):
        images.append(img)
        action, _ = model.predict(obs, deterministic=True)
        obs, _, _, _ = model.env.step(action)
        img = model.env.render(mode='rgb_array')
    imageio.mimsave(
        '../data/videos/PPO_controlTableLine.gif',
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=29)

else:
    show_env = controlTableLine()
Example #30
def train(env_id, num_timesteps, seed, policy, n_envs=8, nminibatches=4,
          n_steps=128, peer=0., scheduler=None, individual=False, repeat=1):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update.
        For recurrent policies, the number of environments run in parallel
        should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment
        per update (i.e. batch size is n_steps * n_env where n_env is
        number of environment copies running in parallel)
    """

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]

    is_atari = 'NoFrameskip' in env_id
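    # make_env() builds a fresh vectorized env on every call: Atari ids get the standard
    # Atari wrappers plus 4-frame stacking, everything else goes through make_vec_env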
    make_env = lambda: VecFrameStack(make_atari_env(env_id, n_envs, seed), 4) if is_atari \
        else make_vec_env(env_id, n_envs, seed)
    print(make_env)

    models = {
        "A": PPO2(
            policy=policy, policy_kwargs={'view': 'even'}, n_steps=n_steps,
            env=make_env(), nminibatches=nminibatches, lam=0.95, gamma=0.99, 
            noptepochs=4, ent_coef=.01, learning_rate=2.5e-4,
            cliprange=lambda f: f * 0.1, verbose=1),
        "B": PPO2(
            policy=policy, policy_kwargs={'view': 'odd'}, n_steps=n_steps,
            env=make_env(), nminibatches=nminibatches, lam=0.95, gamma=0.99, 
            noptepochs=4, ent_coef=.01, learning_rate=2.5e-4,
            cliprange=lambda f: f * 0.1, verbose=1)}

    views = {view: View(models[view], peer=peer) for view in ("A", "B")}

    n_batch = n_envs * n_steps
    n_updates = num_timesteps // n_batch

    for t in range(n_updates):
        logger.info("current episode:", t)
        for view in "A", "B":
            models[view].learn(n_batch)
        if not individual:
            for view, other_view in zip(("A", "B"), ("B", "A")):
                obses, _, _, actions, _, _, _, _, _ = models[other_view].rollout
                views[view].peer = peer * scheduler(t)
                logger.info("current alpha:", views[view].peer)
                for _ in range(repeat):
                    views[view].learn(
                        obses, actions, views[view].learning_rate / repeat)

    for view in "A", "B":
        models[view].env.close()
        del models[view]  # free memory