def test_predict_SAC():
    '''
    Visualize predictions from an untrained (randomly initialised) SAC policy.
    '''
    env = gym.make('KukaMujocoSAC-v0')
    model = SAC(SAC_MlpPolicy, env)
    obs = env.reset()
    while True:
        action, _ = model.predict(obs)
        obs, rew, done, info = env.step(action, render=True)
        if done:
            obs = env.reset()
Example 2
def main():
    parser = argparse.ArgumentParser("Insertion, Manual mode")
    parser.add_argument('checkpoint_path', type=str, help='Path to checkpoint')
    parser.add_argument('--host',
                        default="192.168.2.121",
                        type=str,
                        help='IP of the server (default: the Windows#2 machine)')
    parser.add_argument(
        '--port',
        default=9090,
        type=int,
        help='Port that should be used to connect to the server')
    parser.add_argument(
        '--use_coord',
        action="store_true",
                        help=('If set, the environment\'s observation space will be '
                              'coordinates instead of images'))
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    env = gym.make('insertion-v0',
                   host=args.host,
                   port=args.port,
                   use_coord=args.use_coord)

    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")

    if args.use_coord:
        model = SAC('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    else:
        model = SAC('CnnPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    # SAC.load() is a classmethod that returns a new model, so the result must be assigned
    model = SAC.load(args.checkpoint_path, env=env)

    obs = env.reset()
    for i in range(10000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example 3
        "death_rate": 0.0,
    }
    for i in range(num_of_paths):

        # Path storage buckets
        episode_path = {
            "s": [],
            "r": [],
            "s_": [],
            "state_of_interest": [],
            "reference": [],
        }
        # while not dones[0]:
        s = env.reset()
        for j in range(max_ep_steps):
            action, _states = model.predict(s)
            s_, rewards, dones, infos = env.step(action)
            # Store observations
            episode_path["s"].append(s)
            episode_path["r"].append(rewards)
            episode_path["s_"].append(s_)
            info = infos[0]
            if "state_of_interest" in info.keys():
                episode_path["state_of_interest"].append(
                    np.array([info["state_of_interest"]]))
            if "reference" in info.keys():
                episode_path["reference"].append(np.array(info["reference"]))

            # Terminate if max step has been reached
            if j == (max_ep_steps - 1):
                dones[0] = True
Example 4
        return get_behav(state, weights={'fr': 0.3})
    except NoPathError:
        return np.zeros(env_depth * 2)


# generate_expert_traj(expert, 'expert', Env(env_depth, env_width, nlayers), n_episodes=100)

# pretrain model
dataset = ExpertDataset(expert_path='expert.npz')
model = SAC('MlpPolicy', Env(env_depth, env_width, nlayers), verbose=1)
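# pretrain() runs behavioral cloning: supervised learning on the expert (observation, action) pairs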
model.pretrain(dataset, n_epochs=5000)
model.save('pretrained_sac')

# Test the pre-trained model
env = model.get_env()
obs = env.reset()

reward_sum = 0
i = 0
for j in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    i += 1
    if done:
        print(reward_sum, i, reward_sum / i)
        reward_sum = 0
        i = 0
        obs = env.reset()

env.close()
Example 5
def main():
    global model, best_model_path, last_model_path, sim_joy
    mission = 'PushStonesEnv'  # Change according to algorithm
    env = gym.make(mission + '-v0').unwrapped

    # Create log and model dir
    # dir = 'stable_bl/' + mission
    dir = 'stable_bl/PushMultipleStones'
    os.makedirs(dir + '/model_dir/sac', exist_ok=True)

    jobs = ['train', 'record', 'record-w/hm', 'BC_agent', 'play']
    job = jobs[1]
    pretrain = True

    if job == 'train':

        # create new folder
        try:
            tests = os.listdir(dir + '/model_dir/sac')
            indexes = []
            for item in tests:
                indexes.append(int(item.split('_')[1]))
            if not bool(indexes):
                k = 0
            else:
                k = max(indexes) + 1
        except FileNotFoundError:
            os.makedirs(dir + '/log_dir/sac')
            k = 0

        model_dir = os.getcwd() + '/' + dir + '/model_dir/sac/test_{}'.format(
            str(k))

        best_model_path = model_dir
        last_model_path = model_dir

        log_dir = dir + '/log_dir/sac/test_{}'.format(str(k))
        logger.configure(folder=log_dir,
                         format_strs=['stdout', 'log', 'csv', 'tensorboard'])

        num_timesteps = int(1e6)

        policy_kwargs = dict(layers=[64, 64, 64])
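        # layers=[64, 64, 64]: three hidden layers of 64 units for the actor and critic networks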

        # SAC - start learning from scratch
        model = SAC(sac_MlpPolicy,
                    env,
                    gamma=0.99,
                    learning_rate=1e-4,
                    buffer_size=500000,
                    learning_starts=0,
                    train_freq=1,
                    batch_size=64,
                    tau=0.01,
                    ent_coef='auto',
                    target_update_interval=1,
                    gradient_steps=1,
                    target_entropy='auto',
                    action_noise=None,
                    random_exploration=0.0,
                    verbose=2,
                    tensorboard_log=log_dir,
                    _init_setup_model=True,
                    full_tensorboard_log=True,
                    seed=None,
                    n_cpu_tf_sess=None,
                    policy_kwargs=policy_kwargs)

        # Load best model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_rew = (model for model in models if 'rew' in model)
        # ind, reward = [], []
        # for model in models_rew:
        #     ind.append(model.split('_')[1])
        #     reward.append(model.split('_')[3])
        # best_reward = max(reward)
        # best_model_ind = reward.index(best_reward)
        # k = ind[best_model_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_rew_' + best_reward, env=env,
        #                  custom_objects=dict(learning_starts=0))
        # Load last saved model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_time = (model for model in models if 'rew' not in model)
        # ind, hour, min = [], [], []
        # for model in models_time:
        #     ind.append(model.split('_')[1])
        #     hour.append(model.split('_')[3])
        #     min.append(model.split('_')[4])
        # date = models_time[0].split('_')[2]
        # latest_hour = max(hour)
        # latest_hour_ind = [i for i, n in enumerate(hour) if n == latest_hour]
        # latest_min = max(min[latest_hour_ind])
        # latest_min_ind = min(latest_min)
        # k = ind[latest_min_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_' + date + '_' + latest_hour[0] + '_' + latest_min + 'zip',
        #                  env=env, custom_objects=dict(learning_starts=0))

        # model = SAC.load(dir + '/model_dir/sac/test_53_rew_24383.0',
        #                  env=env, tensorboard_log=log_dir,
        #                  custom_objects=dict(learning_starts=0, learning_rate=2e-4,
        #                                      train_freq=8, gradient_steps=4, target_update_interval=4))
        # #                                              # batch_size=32))

        # pretrain
        if pretrain:
            # load dataset only once
            # expert_dataset('3_rocks_40_episodes')
            dataset = ExpertDataset(expert_path=(os.getcwd() + '/dataset.npz'),
                                    traj_limitation=-1)
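            # traj_limitation=-1 loads every trajectory from dataset.npz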
            model.pretrain(dataset, n_epochs=2000)

        # Test the pre-trained model
        # env = model.get_env()
        # obs = env.reset()
        #
        # reward_sum = 0.0
        # for _ in range(1000):
        #     action, _ = model.predict(obs)
        #     obs, reward, done, _ = env.step(action)
        #     reward_sum += reward
        #     if done:
        #         print(reward_sum)
        #         reward_sum = 0.0
        #         obs = env.reset()
        #
        # env.close()

        # learn
        model.learn(total_timesteps=num_timesteps, callback=save_fn)

        # PPO1
        # model = PPO1(Common_MlpPolicy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
        #      optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5,
        #      schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True,
        #      policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)

        # TRPO
        # model = TRPO(MlpPolicy, env, timesteps_per_batch=4096, tensorboard_log=log_dir, verbose=1)
        # model.learn(total_timesteps=500000)
        # model.save(log_dir)

    elif job == 'record':

        mission = 'PushStonesHeatMapEnv'
        env = gym.make(mission + '-v0').unwrapped

        obs = []
        actions = []
        rewards = []
        dones = []
        episode_rewards = []

        num_episodes = 30

        listener = keyboard.Listener(on_press=on_press)
        listener.start()
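        # The pynput listener runs in a background thread; on_press presumably toggles recorder_on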

        episode = 0
        while episode < num_episodes:

            ob = env.reset()
            done = False
            print('Episode number ', episode + 1)
            episode_reward = 0

            while not done:

                act = "recording"
                # act = sim_joy
                # act = [0,1,0.5]
                new_ob, reward, done, info = env.step(act)

                # print(info['action'])
                # print(ob)

                if recorder_on:
                    obs.append(ob)
                    actions.append(info['action'])
                    rewards.append(reward)
                    dones.append(done)
                    episode_reward = episode_reward + reward

                ob = new_ob

            episode_rewards.append(episode_reward)

            if info['reset reason'] in ('out of boarders', 'limit time steps'):
                # do not count this episode; run it again
                continue
            else:
                print('saving data')
                data_saver(obs, actions, rewards, dones, episode_rewards)
                episode += 1

    elif job == 'play':
        # env = gym.make('PickUpEnv-v0')
        model = SAC.load(dir + '/model_dir/sac/test_25_25_14_15',
                         env=env,
                         custom_objects=dict(learning_starts=0))  ### ADD NUM

        for _ in range(2):

            obs = env.reset()
            done = False
            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
Example 6
    if mode == 'train':
        env.reset()
        env.agg.case = 'rl_agg'
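        # LnMlpPolicy is the SAC MLP policy with layer normalisation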
        model = SAC(LnMlpPolicy,
                    env,
                    learning_rate=0.03,
                    verbose=1,
                    tensorboard_log="tensorboard_logs")
        # note that the env won't record MPCCalc output for the training period
        model.learn(total_timesteps=5000, tb_log_name=model_name)
        model.save(model_name)

    obs = env.reset()
    env.agg.case = 'rl_agg'
    for t in range(1, num_steps + 1):
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if (t % checkpoint_interval == 0) or (t == num_steps):
            env.agg.write_outputs()

if 'dn' in run:
    env.agg.config['agg']['tou_enabled'] = False
    env.agg.config['agg']['base_price'] = 0.1
    env.agg._build_tou_price()
    env.agg.redis_add_all_data()
    for h in env.agg.all_homes_obj:
        h.initialize_environmental_variables()

    obs = env.reset()
    env.agg.case = 'baseline'
Example 7
                     max_n_envs=1,
                     specific_env_len=70,
                     s_len=150,
                     walls=True,
                     target_vel=params["target_vel"],
                     use_contacts=params["use_contacts"])

        print("Testing")
        policy_name = "H02"  # LX3, 63W (tiles): joints + contacts + yaw
        policy_path = 'agents/SBL_{}'.format(policy_name)
        model = SAC.load(policy_path)
        print("Loading policy from: {}".format(policy_path))

        obs = env.reset()
        for _ in range(100):
            cum_rew = 0
            t1 = time.time()
            for i in range(800):
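                # deterministic=True returns the deterministic policy action (mean) instead of sampling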
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action, render=True)
                cum_rew += reward
                #env.render()
                if done:
                    t2 = time.time()
                    print("Time taken for episode: {}".format(t2 - t1))
                    obs = env.reset()
                    print(cum_rew)
                    break

        env.close()
Example 8
import gym
import numpy as np

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

from env import OsmoEnv

if __name__ == "__main__":
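    # DummyVecEnv wraps the single env in the vectorised interface that stable-baselines expects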
    env = DummyVecEnv([lambda: OsmoEnv()])

    model = SAC(MlpPolicy, env, verbose=1, learning_rate=1e-4)
    model.learn(total_timesteps=30000)
    model.save("SAC_baselines")

    env = OsmoEnv()
    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, _, done, info = env.step(action)
        print(info)
Example 9
class SAC_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = [] 
        self.env_names = []
    
    def make_env(self, env_id, rank, seed=0):
        """
        Utility function for multiprocessed env.
    
        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environment you wish to have in subprocesses
        :param seed: (int) the inital seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            env = Template_Gym()
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init
    

    def train(self, num_e=1, n_timesteps=10000000, save_fraction=0.1, save='saves/m1'):
        env_id = "default"
        num_e = 32  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])
        #Ramona
        #self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        env = Template_Gym()
        self.env = DummyVecEnv([lambda: env])
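        # VecNormalize keeps running statistics and normalises observations and rewards on the fly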
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-5, tensorboard_log="./test6" )
        
        self.model = SAC(CustomPolicy_sac, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./m1lstm1")
        #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/" )
        n_timesteps = n_timesteps * save_fraction
        n_timesteps = int(n_timesteps)
        training_loop = 1 / save_fraction
        training_loop = int(training_loop)
        
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(save+str(i))
    
    
    def evaluate(self, num_env=32, num_steps=50, load="saves/defaultlstmday", runs=10):
        """
        Evaluate a RL agent
        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = 'default'
        num_e = 1
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default" )
        self.env = VecNormalize(self.env, norm_obs=False, norm_reward=True)
        for i in range(runs):
            self.model = PPO2.load(load+str(i), self.env, policy=CustomPolicy_2, tensorboard_log="./default/" )
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            for step in range(num_steps):
                # _states are only useful when using LSTM policies
                actions, _states = self.model.predict(obs)
                # actions, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, info = self.env.step(actions)
                #self.total_pips.append(self.env.player.placement)

                # Stats
                for env_idx in range(self.env.num_envs):
                    episode_rewards[env_idx][-1] += rewards[env_idx]
                    if dones[env_idx]:
                        episode_rewards[env_idx].append(0.0)

            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for env_idx in range(self.env.num_envs):
                mean_rewards[env_idx] = np.mean(episode_rewards[env_idx])
                n_episodes += len(episode_rewards[env_idx])

        # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward
Example 10
            policy_kwargs={
                'layers': [64, 64],
                'reg_weight': 1e-32
            })

model.learn(total_timesteps=100000, log_interval=10)

obs, act = [], []
nb_rollouts, nb_steps = 25, 200
for n in range(nb_rollouts):
    _obs = np.empty((nb_steps, dm_obs))
    _act = np.empty((nb_steps, dm_act))

    x = env.reset()
    for t in range(nb_steps):
        u, _ = model.predict(x)
        _obs[t, :], _act[t, :] = x, u
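        # the stored action is the raw policy output; the clipped value below is what the env receives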
        u = np.clip(u, -ulim, ulim)
        x, r, _, _ = env.step(u)

    obs.append(_obs)
    act.append(_act)

import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=dm_obs + dm_act, figsize=(12, 4))
for _obs, _act in zip(obs, act):
    for k, col in enumerate(ax[:-1]):
        col.plot(_obs[:, k])
    ax[-1].plot(_act)
plt.show()