Example #1
def main(logdir):
    # params
    SLEEP_RATE = 100  #100Hz
    N_EPISODE = 1000
    EPISODE_TIME = 30
    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir (note: this hardcoded path overrides the logdir argument)
    logdir = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2'
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)
    # callback
    SAVE_FREQ = EPISODE_LENGTH * 20  # every 20 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # training got killed for some reason, so continue from the checkpoint
    model_path = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2/best_model.zip'
    model = SAC.load(model_path)
    model.set_env(env)

    print("---------- Start Learing -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS,
                log_interval=SAVE_FREQ,
                callback=callback)

    print("---------- Finish Learning ----------")
    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS,
                                 results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()
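SaveOnBestTrainingRewardCallback used above is a user-defined callback that is not shown in this example. A minimal sketch along the lines of the stable-baselines callback documentation (the class name and the check_freq/log_dir arguments come from the call above; the body is an assumption) might look like:

import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the model whenever the mean training reward over the last 100 episodes improves."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Monitor writes monitor.csv into log_dir; ts2xy yields (timesteps, episode rewards)
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(y) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True

Saving to log_dir/best_model would be consistent with the best_model.zip file that the example reloads from logdir.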
Example #2
def main():
    env = gym.make("teaching-env-v0",
                   teacher_path=os.path.join(os.getcwd(), "../saved_models",
                                             sys.argv[1]),
                   validation_path=DATA_PATH,
                   max_queries=config.MAX_QUERIES)
    agent_model = SAC(MlpPolicy,
                      env,
                      train_freq=1,
                      batch_size=64,
                      learning_rate=3e-4,
                      learning_starts=0,
                      buffer_size=1000,
                      random_exploration=config.EPSILON_EXPLORATION,
                      gamma=config.GAMMA,
                      verbose=1)
    #agent_model.learn(total_timesteps=config.MAX_QUERIES * config.NUM_TRAIN_EPISODES)
    #agent_model.save('test_SAC')

    # SAC.load is a classmethod that returns a new model, so assign the result
    agent_model = SAC.load('test_SAC', env=env)

    obs = env.reset()

    total_reward = float('-inf')
    prog = tqdm(range(config.MAX_QUERIES), postfix={'Reward': total_reward})

    actions = []  # For visualization
    total_reward = 0.0
    for i in prog:
        action = select_action(agent_model,
                               obs,
                               epsilon=config.EPSILON_EXPLORATION)
        #action, _states = agent_model.predict(obs, deterministic=False)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        prog.set_postfix({'Reward': total_reward})
        actions.append(np.asscalar(action))
    plt.hist(actions, bins=config.NUM_BINS, range=(-5, 5), density=True)
    plt.savefig('./visualizations/histograms/SAC')
    plt.clf()

    # Plot student's predicted function
    inputs = np.linspace(-5, 5, num=1000)
    outputs = env.student_model(inputs.reshape(-1, 1))
    plt.scatter(inputs, outputs, s=0.1, label='SAC')
    plt.title("SAC Student's Approximation")
    plt.ylim((-60, 100))
    plt.savefig('./visualizations/functions/SAC')
    plt.clf()
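select_action is a helper that is not defined in this example. Judging from the epsilon argument, it presumably mixes random exploration with the learned policy; a hypothetical sketch (the behaviour is an assumption, not from the source) could be:

import numpy as np

def select_action(model, obs, epsilon=0.0):
    # With probability epsilon take a random action, otherwise follow the SAC policy
    if np.random.rand() < epsilon:
        return model.action_space.sample()
    action, _states = model.predict(obs, deterministic=False)
    return action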
Example #3
def main():
    parser = argparse.ArgumentParser("Insertion, Manual mode")
    parser.add_argument('checkpoint_path', type=str, help='Path to checkpoint')
    parser.add_argument('--host',
                        default="192.168.2.121",
                        type=str,
                        help='IP of the server (default is a Windows#2)')
    parser.add_argument(
        '--port',
        default=9090,
        type=int,
        help='Port that should be used to connect to the server')
    parser.add_argument(
        '--use_coord',
        action="store_true",
        help=('If set, the environment\'s observation space will be '
              'coordinates instead of images'))
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    env = gym.make('insertion-v0',
                   kwargs={
                       'host': args.host,
                       "port": args.port,
                       "use_coord": args.use_coord
                   })

    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")

    if args.use_coord:
        model = SAC('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    else:
        model = SAC('CnnPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    # SAC.load is a classmethod that returns a new model, so assign the result
    model = SAC.load(args.checkpoint_path, env=env)

    obs = env.reset()
    for i in range(10000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #4
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = make_mujoco_env(env, 0)
    env = Monitor(env, log_dir + "/")

    continue_train = False
    if continue_train:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  #action_noise=action_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
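This function relies on module-level policy, n_timesteps and callback variables that are not shown in the snippet. Plausible definitions (assumptions, for illustration only) would be:

policy = 'MlpPolicy'    # SAC policy identifier
n_timesteps = int(1e6)  # total training budget
callback = None         # or a stable_baselines callback instance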
Example #5
def test_models(env):
    # seeds = [1, 2, 3]
    seeds = [1]

    for s in seeds:
        # Load Models
        # models = [A2C.load(f'data/models/a2c_{s}'),
        #           ACKTR.load(f'data/models/acktr_{s}'),
        #           DDPG.load(f'data/models/ddpg_{s}'),
        #           PPO2.load(f'data/models/ppo_{s}'),
        #           SAC.load(f'data/models/sac_{s}'),
        #           TD3.load(f'data/models/td3_{s}'),
        #           TRPO.load(f'data/models/trpo_{s}')]

        models = [PPO2.load(f'data/models/ppo_{s}'),
                  SAC.load(f'data/models/sac_{s}'),
                  TD3.load(f'data/models/td3_{s}'),
                  TRPO.load(f'data/models/trpo_{s}')]

        for m in models:
            # run_policy(m, env)
            og_params = m.get_parameters()
            generalization_test(m, env)

            for i in range(50):
                params = prune_policy(m.__class__.__name__, og_params, 0.1)
                m.load_parameters(params)
                generalization_test(m, env)
Example #6
def example():
    # This tutorial shows how to view policies of trained actors

    task = generate_task(task_generator_id='picking')
    world_params = dict()
    world_params["skip_frame"] = 3
    world_params["seed"] = 0
    stable_baselines_policy_path = "./model_2000000_steps.zip"
    model = SAC.load(stable_baselines_policy_path)

    # define a method for the policy fn of your trained model
    def policy_fn(obs):
        return model.predict(obs, deterministic=True)[0]

    # Recording a video of the policy is done in one line
    viewer.record_video_of_policy(task=task,
                                  world_params=world_params,
                                  policy_fn=policy_fn,
                                  file_name="pushing_video",
                                  number_of_resets=10,
                                  max_time_steps=10 * 100)

    # Similarly for interactive visualization in the GUI
    viewer.view_policy(task=task,
                       world_params=world_params,
                       policy_fn=policy_fn,
                       max_time_steps=40 * 600,
                       number_of_resets=40)
Example #7
def get_new_weights():

    v_env = PortfolioEnv(settings['data_file'], settings['output_file'],
                         settings['strategy_name'], settings['total_steps'],
                         settings['window_length'], settings['capital_base'],
                         settings['lot_size'], settings['leverage'],
                         settings['commission_percent'],
                         settings['commission_fixed'],
                         settings['max_slippage_percent'],
                         settings['start_idx'], settings['compute_indicators'],
                         settings['compute_reward'],
                         settings['compute_position'], settings['debug'])
    #   Create the vectorized environment
    #   v_env = DummyVecEnv([lambda: v_env])
    #   Normalize environment
    #   v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'], norm_reward=settings['norm_reward'], clip_obs=settings['clip_obs'], clip_reward=settings['clip_reward'], gamma=p_gamma, epsilon=EPS)

    model = SAC.load(MODELS_DIR + settings['model_name'])

    # Strategy

    obs = v_env.reset()
    dones = False

    while not dones:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = v_env.step(action)
        #   v_env.render(mode='ansi')

    weights = v_env.current_weights

    return weights
Example #8
def evaluate_policy(policy_file, policy_type, envname, num_rollouts):
    if policy_type == "ppo":
        model = PPO2.load(policy_file)

        def get_action(obs):
            return model.predict(obs)[0]

    elif policy_type == "sac":
        model = SAC.load(policy_file)

        def get_action(obs):
            return model.predict(obs, deterministic=True)[0]

    else:
        raise NotImplementedError()

    env = gym.make(envname)

    returns = []
    for i in range(num_rollouts):
        # print("iter", i, end=" ")
        obs = env.reset()
        done = False
        totalr = 0.0
        while not done:
            action = get_action(obs)
            obs, r, done, _ = env.step(action)
            totalr += r
        returns.append(totalr)

    return np.mean(returns), np.std(returns)
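A hypothetical call to evaluate_policy (the checkpoint path and environment name are placeholders, not from the source):

mean_ret, std_ret = evaluate_policy("checkpoints/sac_halfcheetah.zip", "sac",
                                    "HalfCheetah-v2", num_rollouts=10)
print("return: %.1f +/- %.1f" % (mean_ret, std_ret))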
Example #9
def rollout_policy(filename, traj_len, seed, env_name, n_trajs=1):
    model = SAC.load(filename)
    env = gym.make(env_name)
    env.seed(seed)

    trajs = []
    for _ in range(int(n_trajs)):
        obs_list, acts_list, rews_list = [], [], []
        obs = env.reset()
        obs_list.append(obs)
        for _ in range(traj_len):
            act = model.predict(obs, deterministic=True)[0]
            obs, r, done, _ = env.step(act)
            # assert not done
            acts_list.append(act)
            obs_list.append(obs)
            rews_list.append(r)

        infos = [{} for _ in range(traj_len)]
        traj = types.TrajectoryWithRew(
            obs=np.array(obs_list),
            acts=np.array(acts_list),
            infos=infos,
            rews=np.array(rews_list),
        )
        trajs.append(traj)

    return trajs
Example #10
def load_model(path: str, env, desc: str):
    """ Loads a model from a stable baseline checkpoint file into a memory representation 

    Args:
        path        (str)           :       Path to the Stable Baseline Checkpoint File 
        env         (SB Env)        :       Path to the Stable Baseline Checkpoint File 
        desc        (str)           :       Text Description of what model this is

    Returns:
        The loaded model
    """

    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
Example #11
def test_SAC(env, out_dir, seed=None, **kwargs):
    model = SAC.load(os.path.join(out_dir, 'final_model'), env=env)
    env.seed(seed)

    # Evaluate the trained agent
    mean_reward = evaluate(env, model, out_dir, num_episodes=20)

    return mean_reward
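evaluate is a helper that is not shown here; a minimal sketch of such an evaluation loop (the signature is taken from the call above, the body is an assumption) might be:

import numpy as np

def evaluate(env, model, out_dir, num_episodes=20):
    # Run the deterministic policy for num_episodes and return the mean episode return.
    # out_dir could be used to store evaluation artefacts; it is unused in this sketch.
    returns = []
    for _ in range(num_episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total += reward
        returns.append(total)
    return float(np.mean(returns))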
Example #12
def play():
    # np.format_float_scientific returns a string, so format with %s here
    model = SAC.load(expDir + "/%s/%s" %
                     (name, np.format_float_scientific(nIter)))
    env = gym.make('PointMassDense-1-v1')
    obs = env.reset()  # reset before the first predict call
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render(mode='human')
Example #13
    def __init__(self, env):

        self.env = env

        load_path_rl="/home/icv/Trustworth/TiRL/models/sac-5"
        log_path_rl="/home/icv/Trustworth/TiRL/data/sac-5"
        
        self.model_rl = SAC.load(load_path_rl, env=env, tensorboard_log=log_path_rl)

        load_path_rule="/home/icv/Trustworth/TiRL/models/sac_rule3"
        log_path_rule="/home/icv/Trustworth/TiRL/data/sac_rule3"
        
        self.model_rule = SAC.load(load_path_rule, env=env, tensorboard_log=log_path_rule)
        self.agent_rule = IDM(env)

        print("load model successfully")

        self.reset()
Example #14
def play(save_dir, env):
    model = SAC.load(save_dir + '/model_dir/sac/test_25_25_14_15',
                     env=env,
                     custom_objects=dict(learning_starts=0))  ### ADD NUM
    for _ in range(2):

        obs = env.reset()
        done = False
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
Example #15
def load_model(config):
    model = None
    if config["algo_name"] == "TD3":
        model = TD3.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "A2C":
        model = A2C.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "SAC":
        model = SAC.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "PPO2":
        model = PPO2.load("agents/{}".format(args["test_agent_path"]))
    assert model is not None, "Alg name not found, cannot load model, exiting. "
    return model
Example #16
    def f_checkpoints_range_2_mean_performance(
            self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
        logging.debug(
            f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
        )
        rewards = np.zeros(len(checkpoints))
        s_rates = np.zeros(len(checkpoints))
        # Intent
        # - Iterate over this range to load the associated Stable Baselines model checkpoint
        # - Pass that model to the `mean_eval` evaluation function, which evaluates the model on
        #   - a certain number of episodes
        #   - a certain env
        #   - a continuous or non-continuous space
        # - an evaluation returns the average reward and the average success rate
        #
        # Evaluate N checkpoints on M episodes each, then average over M to finally obtain N rewards and N success rates

        j = 0
        """ NOTE: i can range in anyway while j iterates over the numpy array 
        """
        for i in checkpoints:
            path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
            logging.debug(f"Evaluating model at {path}")
            if self.args.model['name'] == "ddpg":
                model = DDPG.load(path)
            elif self.args.model['name'] == "ppo":
                model = PPO2.load(path)
            elif self.args.model['name'] == "trpo":
                model = TRPO.load(path)
            elif self.args.model['name'] == "td3":
                model = TD3.load(path)
            elif self.args.model['name'] == "sac":
                model = SAC.load(path)
            logging.debug(
                f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
            )
            rewards_list, success_rates_list = mean_eval(
                num_episodes=self.args.n_episodes,
                checkpoint_id=i,
                model=model,
                env=self.env,
                v=True,
                continuous=self.args.continuous,
                plots_dir=self.args.plots_dir)
            rewards_mean = np.mean(rewards_list)
            success_rates_mean = np.mean(success_rates_list)
            logging.debug(
                f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
            )
            rewards[j] = rewards_mean
            s_rates[j] = success_rates_mean
            j += 1
        return rewards, s_rates
Example #17
    def load_model(self):
        model_path = "data/saved_models/"
        if self.folder:
            model_path = model_path + self.folder + "/"
        else:
            model_path = model_path + self.model_name + "/"

        model_path = model_path + self.model_name

        if self.episode:
            model_path = model_path + "_" + self.episode + ".pkl"

        self.model = SAC.load(model_path)
Example #18
def load_model(model_path, params):
    env_cls = globals()[params['env']]
    orig_env = env_cls(**params['env_options'])
    env = DummyVecEnv([lambda: orig_env])

    if params['alg'] == 'PPO2':
        model = PPO2.load(model_path, env=env)
    elif params['alg'] == 'SAC':
        model = SAC.load(model_path, env=env)
    else:
        raise NotImplementedError

    return orig_env, model
Example #19
def record(exp):
    model = SAC.load(exp)
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos_2"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model.set_env(env)
    model.learn(total_timesteps=2000, log_interval=100)
    # model.save(expDir + "/%s/%d" %(name, nIter))
    env.close()
Example #20
    def __init__(self, env):

        self.T = 1.5
        self.g0 = 4
        self.a = 0.73
        self.b = 1.67
        self.delta = 4
        self.decision_dt = 0.75
        self.length_x = 5  # front vehicle length
        self.env = env

        load_path = "/home/icv/Trustworth/TiRL/models/sac-5"
        log_path = "/home/icv/Trustworth/TiRL/data/sac-5"

        self.model = SAC.load(load_path, env=env, tensorboard_log=log_path)
        print("load model successfully")
Example #21
def test_single_episode(model):
    act = SAC.load(model)
    done = False

    #for i in range(1):
    obs, done = env._validate(1,8,1.0,0.585), False
    episode_rew = 0
    actions = list()
    while not done:
        #env.render()
        # SAC models are not callable; use predict() to select an action
        action, _states = act.predict(obs, deterministic=True)
        obs, rew, done, _ = env.step(action)
        episode_rew += rew
    print("Episode total reward", episode_rew)

    return episode_rew
Example #22
def mk_env_agent(env_class, registered_model, params, gui=False):
    model = SAC.load(registered_model.source)

    params_fname = f'{registered_model.source}.json'  # FIXME
    with open(params_fname, 'r') as fp:
        loaded_params = json.load(fp)

    params = {**loaded_params, **params}  # merge, overriding loaded params
    env = make_vec_env(lambda: env_class(params['NJ'], params, gui=gui),
                       n_envs=1)

    model.set_env(env)
    env.env_method('set_render_info', {
        'name': registered_model.name,
        'version': registered_model.version
    })  # FIXME

    return env, model
Example #23
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None
    # Param noise for exploration (note: not actually passed to the SAC constructor below)
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                         desired_action_stddev=0.1)

    continue_model = False
    if continue_model is True:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  # action_noise=param_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
Example #24
def sac(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None,
        load_weights=None):
    env = gym.make(env_id)

    if load_weights is not None:
        model = SAC.load(load_weights, env, verbose=0)
    else:
        model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="sac", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
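WandbRenderEnvCallback is a project-specific callback that is not shown here. Purely as an assumption about what it might do, a minimal stable-baselines callback that periodically renders the training env and logs the frame to Weights & Biases could look like:

import wandb
from stable_baselines.common.callbacks import BaseCallback

class WandbRenderEnvCallback(BaseCallback):
    """Hypothetical stand-in: logs a rendered frame to wandb every render_freq steps."""

    def __init__(self, model_name, env_name, render_freq=1000, verbose=0):
        super(WandbRenderEnvCallback, self).__init__(verbose)
        self.render_freq = render_freq
        wandb.init(project=env_name, name=model_name, reinit=True)

    def _on_step(self):
        if self.n_calls % self.render_freq == 0:
            frame = self.training_env.render(mode="rgb_array")
            wandb.log({"frame": wandb.Image(frame),
                       "timesteps": self.num_timesteps})
        return True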
Example #25
def get_SAC_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)
    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = SAC.load(ckpt_path,
                         env=env,
                         _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         verbose=1,
                         tensorboard_log=tb_path)
        model.num_timesteps = ckpt_step
    else:
        model = SAC(SACMlpPolicy,
                    env,
                    _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    **model_settings['train_configs'],
                    verbose=1,
                    tensorboard_log=tb_path)
    return model, env
Example #26
def get_trajectories(env, policy_path, policy_type, n_rollouts, time_horizon):
    if policy_type == "sac":
        from stable_baselines import SAC

        model = SAC.load(policy_path)

        def get_action(obs):
            return model.predict(obs, deterministic=True)[0]

    elif policy_type == "gail":
        from imitation.policies import serialize
        from stable_baselines3.common.vec_env import DummyVecEnv

        venv = DummyVecEnv([lambda: env])
        model = serialize.load_policy("ppo", policy_path, venv)

        def get_action(obs):
            return model.predict(obs)[0]

    elif policy_type == "dads":
        data = load_data(policy_path)
        return data["observations"]
    else:
        raise NotImplementedError()

    trajectories = []

    for _ in range(n_rollouts):
        trajectory = []
        obs = env.reset()
        trajectory.append(list(obs))
        for t in range(time_horizon - 1):
            action = get_action(obs)
            # trajectory.extend(list(action))
            obs, reward, done, info = env.step(action)
            trajectory.append(list(obs))
        trajectories.append(trajectory)

    return trajectories
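A hypothetical way to call get_trajectories and stack the result (the environment and checkpoint path are placeholders):

import gym
import numpy as np

env = gym.make("Reacher-v2")  # placeholder environment
trajs = get_trajectories(env, "policies/sac_reacher.zip", "sac",
                         n_rollouts=5, time_horizon=100)
data = np.array(trajs)  # shape: (n_rollouts, time_horizon, obs_dim)
print(data.shape)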
Example #27
inference = True
# Enjoy trained agent
num_of_paths = 1
max_ep_steps = 800
algorithm = "SAC"  # PPO2, SAC, DDPG
model_save_name = "SAC_1_Ex3_EKF_gyro-v0_model_1"  #"ppo2_ekf_0", "sac_ekf_model_2"
env_name = 'Ex3_EKF_gyro-v0'  # 'Ex3_EKF_gyro-v0', 'Pendulum-v0','Ex3_pureEKF_gyro'

if algorithm == "PPO2":
    from stable_baselines.common import make_vec_env
    from stable_baselines import PPO2
    model = PPO2.load(model_save_name)
    env = make_vec_env(env_name)
elif algorithm == "SAC":
    from stable_baselines import SAC
    model = SAC.load(model_save_name)
    env = gym.make(env_name)
elif algorithm == "DDPG":
    from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
    from stable_baselines import DDPG
    model = DDPG.load(model_save_name)
    env = gym.make(env_name)

if inference:
    save_figs = False
    LOG_PATH = "./logs"
    fig_file_type = "pdf"
    roll_out_paths = {}
    roll_out_paths = {
        "s": [],
        "r": [],
Example #28
                         action_noise=action_noise,
                         tensorboard_log=tensorboard_log_dir)
            model.learn(total_timesteps=total_timesteps_,
                        log_interval=1,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)

del model  # remove to demonstrate saving and loading

if inference:
    if algorithm == "PPO2":
        env = make_vec_env('Ex3_EKF_gyro-v0')
        model = PPO2.load("ppo2_ekf_0")
    elif algorithm == "SAC":
        env = gym.make('Ex3_EKF_gyro-v0')
        model = SAC.load("sac_ekf_model_0")
    # Enjoy trained agent
    num_of_paths = 1
    max_ep_steps = 1000
    save_figs = False
    LOG_PATH = "./logs"
    fig_file_type = "pdf"
    roll_out_paths = {}
    roll_out_paths = {
        "s": [],
        "r": [],
        "s_": [],
        "state_of_interest": [],
        "reference": [],
        "episode_length": [],
        "return": [],
Example #29
import gym
import numpy as np

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC

env = gym.make('BipedalWalkerHardcore-v2')

model = SAC.load("sac_walker500000")

obs = env.reset()
for i in range(0, 1500):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    #print(rewards)
    env.render()
    if rewards == -100:
        break
env.close()
Example #30
import gym
import numpy as np
import imageio

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000, log_interval=10)

model.save("../models/sac_pendulum")

del model  # remove to demonstrate saving and loading

model = SAC.load("../models/sac_pendulum")

#obs = env.reset()
#while True:
#    action, _states = model.predict(obs)
#    obs, rewards, dones, info = env.step(action)
#    env.render()