Example #1
def test(testing_data, model_file, result):
    model = TRPO.load(model_file)

    # set testing environment
    stock_test_data = StocksData.read_csv(testing_data)
    stocks_test_env = StocksEnv(stock_test_data,
                                bars_count=10,
                                reset_on_close=False)
    obs = stocks_test_env.reset()

    # set vars for recording results
    result_df = pandas.DataFrame([],
                                 columns=['date', 'open', 'action', 'reward'])
    net_reward = 0.0

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = stocks_test_env.step(action)

        # print and record the offset, action taken, reward, opening price
        df = pandas.DataFrame([[
            stock_test_data.date[int(info["offset"])],
            stock_test_data.open[int(info["offset"])],
            Actions(action).name, reward
        ]],
                              columns=['date', 'open', 'action', 'reward'])
        print(df)
        result_df = result_df.append(df, ignore_index=True)
        net_reward += reward

        # at end of episode, record results and exit
        if done:
            print('Net Reward: ', net_reward)
            result_df.to_csv(result, index=False)
            break
Example #2
def test_models(env):
    # seeds = [1, 2, 3]
    seeds = [1]

    for s in seeds:
        # Load Models
        # models = [A2C.load(f'data/models/a2c_{s}'),
        #           ACKTR.load(f'data/models/acktr_{s}'),
        #           DDPG.load(f'data/models/ddpg_{s}'),
        #           PPO2.load(f'data/models/ppo_{s}'),
        #           SAC.load(f'data/models/sac_{s}'),
        #           TD3.load(f'data/models/td3_{s}'),
        #           TRPO.load(f'data/models/trpo_{s}')]

        models = [PPO2.load(f'data/models/ppo_{s}'),
                  SAC.load(f'data/models/sac_{s}'),
                  TD3.load(f'data/models/td3_{s}'),
                  TRPO.load(f'data/models/trpo_{s}')]

        for m in models:
            # run_policy(m, env)
            og_params = m.get_parameters()
            generalization_test(m, env)

            for i in range(50):
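                # prune a fraction (0.1) of the original parameters and
                # re-evaluate how well the pruned policy generalizes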
                params = prune_policy(m.__class__.__name__, og_params, 0.1)
                m.load_parameters(params)
                generalization_test(m, env)
Example #3
def test(model_path: str, exp_config: dict):

    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # model = HER.load(model_path, env=test_env)
        raise NotImplementedError()
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)

    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)

    raw_env.configure(seed=42,
                      mass_mean=(0.05, 1.5),
                      mass_stdev=(0.01, 0.15),
                      embed_knowledge=exp_config.get('embed_knowledge', False),
                      perfect_knowledge=exp_config.get('perfect_knowledge',
                                                       False),
                      gym_env=test_env)

    runs = np.zeros((TEST_RUNS, 4))
    fixed_masses = np.linspace(0.030, 1.600, TEST_RUNS)

    for test_ep in range(runs.shape[0]):

        obs = test_env.reset()

        if TEST_LINSPACE_MASS:
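            # replace the randomly sampled pendulum mass with a fixed value
            # from the linspace sweep so that test masses are evenly spaced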
            p = raw_env.physical_props
            raw_env.physical_props = p[0], fixed_masses[test_ep], p[2]

        mass_distr_params = raw_env.mass_distr_params.copy()
        sampled_mass = raw_env.physical_props[1]

        while True:
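            # step until the Monitor records a new episode reward, i.e. the
            # current test episode has terminated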
            action, states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = test_env.step(action)
            rewards_by_episode = monitor.episode_rewards
            episode = len(rewards_by_episode)
            if episode != test_ep:
                break

        last_tot_reward = rewards_by_episode[-1]
        runs[test_ep, :] = (mass_distr_params[0], mass_distr_params[1],
                            sampled_mass, last_tot_reward)

    avg_reward = runs[:, 3].mean()
    print(f'Avg. test reward: {avg_reward}\n')

    return runs
Example #4
def load_model(path: str, env, desc: str):
    """ Loads a model from a stable baseline checkpoint file into a memory representation 

    Args:
        path        (str)           :       Path to the Stable Baseline Checkpoint File 
        env         (SB Env)        :       Path to the Stable Baseline Checkpoint File 
        desc        (str)           :       Text Description of what model this is

    Returns:
        The loaded model
    """

    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
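
A minimal usage sketch for the helper above; the environment name and checkpoint path are placeholders, not taken from the original project:

import gym

env = gym.make('Pendulum-v0')  # hypothetical environment
model = load_model('checkpoints/trpo_pendulum.zip', env, 'trpo')  # hypothetical path

obs = env.reset()
action, _states = model.predict(obs)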
Example #5
def train_trpo(seed):
    """
    test TRPO on the uav_env(cartesian,discrete)
    """
    """
    TRPO(policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, 
    lam=0.98, entcoeff=0.0, cg_damping=0.01, vf_stepsize=0.0003, vf_iters=3, verbose=0, 
    tensorboard_log=None, _init_setup_model=True)
    """
    algo = 'TRPO'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    # Tested with: timesteps_per_batch=1024
    model = TRPO(policy=MlpPolicy, env=env, gamma=0.99, timesteps_per_batch=128,
                 max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=0.01,
                 vf_stepsize=0.0003, vf_iters=3, verbose=0,
                 tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = TRPO.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
Example #6
def mainUp(arg):
    test = arg == TEST
    
    env = fet.FurutaEnvPosTrpoUp(cm.RUN, render = not test) 
    #env.setRender(True)
    model = TRPO.load(POLICY_PATH + "trpo_pos_policy_up.zip")
    
    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_UP:
            print("\n***Average reward: %.3f\tAverage count: %.3f\tShort runs: %d" % (sum(buf_rew)/float(len(buf_rew)), total_count/float(test_count), test_cutoff_count - overspeed))
            break
            
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        buf_rew.append(episode_rew)
        if test and count <= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode average reward: %.3f\tCount: %d" % (episode_rew/count, count))
Example #7
    def my_compute_data(self, args, env, params, n_episodes):
        env = gym.make('gym_quadcopter:quadcopter-v' + str(args.env))
        for alg, start_index, end_index, step, suffix in params:
            re_d = []
            sr_d = []
            rewards, s_rates = [], []
            for i in range(start_index, end_index, step):
                print("")
                print(
                    f"Working on alg={alg}, start_index={start_index}, end_index={end_index}, step={step}, suffix={suffix}, i={i}"
                )
                path = f"{self.base_dir}models/{alg}/quadcopter-v{args.env}-{i}{suffix}.pkl"
                print(f"Evaluating model at {path}")
                if not os.path.exists(path):
                    print(f"WARNING: File {path} does not exist --> SKIPPING")
                    continue

                if alg == "ddpg":
                    model = DDPG.load(path)
                elif alg == "ppo":
                    model = PPO2.load(path)
                else:
                    model = TRPO.load(path)
                r, su = mean_eval(n_episodes, model, env, False, False)
                print(f"Average Success Rate: {su}")
                rewards.append(r)
                s_rates.append(su[0])

            i_max = np.argmax(s_rates)
            re_d.append(rewards)
            sr_d.append(s_rates)
            return re_d, sr_d
Example #8
def loader(algo, env_name):
    if algo == 'dqn':
        return DQN.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'ppo2':
        return PPO2.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'a2c':
        return A2C.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'acer':
        return ACER.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'trpo':
        return TRPO.load("trained_agents/" + algo + "/" + env_name + ".pkl")
Example #9
def mainHybrid(arg):
    test = arg == TEST
    
    env = fet.FurutaEnvPosTrpo(cm.RUN, render = not test) 
    #env.setRender(True)
    modelBal = TRPO.load(POLICY_PATH + "trpo_pos_policy_bal.zip")
    modelUp = TRPO.load(POLICY_PATH + "trpo_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    complete_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_HYBRID:
            print("\n***Average reward: %.3f\tLong runs: %d\tComplete: %d" % (sum(buf_rew)/float(len(buf_rew)), test_cutoff_count - overspeed, complete_count))
            break
            
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
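            # use the swing-up policy while the pendulum is far from upright,
            # otherwise hand control to the balance policy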
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)
                
            obs, rew, done, _ = env.step(action)
            
            if speedCheck(obs):
                overspeed += 1
                
            episode_rew += rew
            count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode reward: %.3f" % (episode_rew))
Example #10
    def f_checkpoints_range_2_mean_performance(
            self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
        logging.debug(
            f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
        )
        rewards = np.zeros(len(checkpoints))
        s_rates = np.zeros(len(checkpoints))
        # Intent:
        # - Iterate over this range to load the associated Stable Baselines model checkpoint.
        # - Pass that model to the `mean_eval` evaluation function, which evaluates it on
        #   - a certain number of episodes,
        #   - a certain env,
        #   - a continuous or discrete action space.
        # - Each evaluation returns a reward and an average success rate.
        #
        # N checkpoints are evaluated on M episodes each; averaging over the M episodes
        # yields N rewards and N success rates.

        j = 0
        # NOTE: the checkpoint ids in `checkpoints` can be arbitrary; j simply indexes
        # the result arrays sequentially.
        for i in checkpoints:
            path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
            logging.debug(f"Evaluating model at {path}")
            if self.args.model['name'] == "ddpg":
                model = DDPG.load(path)
            elif self.args.model['name'] == "ppo":
                model = PPO2.load(path)
            elif self.args.model['name'] == "trpo":
                model = TRPO.load(path)
            elif self.args.model['name'] == "td3":
                model = TD3.load(path)
            elif self.args.model['name'] == "sac":
                model = SAC.load(path)
            logging.debug(
                f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
            )
            rewards_list, success_rates_list = mean_eval(
                num_episodes=self.args.n_episodes,
                checkpoint_id=i,
                model=model,
                env=self.env,
                v=True,
                continuous=self.args.continuous,
                plots_dir=self.args.plots_dir)
            rewards_mean = np.mean(rewards_list)
            success_rates_mean = np.mean(success_rates_list)
            logging.debug(
                f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
            )
            rewards[j] = rewards_mean
            s_rates[j] = success_rates_mean
            j += 1
        return rewards, s_rates
Example #11
def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO
    if algorithm == 'PPO2':
        return PPO2.load(path)
    if algorithm == 'DQN':
        return DQN.load(path)
    if algorithm == 'A2C':
        return A2C.load(path)
    if algorithm == 'ACER':
        return ACER.load(path)
    if algorithm == 'GAIL':
        return GAIL.load(path)
    if algorithm == 'TRPO':
        return TRPO.load(path)
    return None
Example #12
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')

    model = TRPO.load("pickbot_model_trpo_discrete_2019-03-11 10:22:01")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            episode_rew += rewards
            print("Episode reward", episode_rew)
Example #13
def render_to_gif():
    def save_frames_as_gif(frames,
                           path='./',
                           filename='growspace_with_trpo.gif'):
        # Mess with this to change frame size
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0),
                   dpi=72)

        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(),
                                       animate,
                                       frames=len(frames),
                                       interval=50)
        anim.save(path + filename, writer='imagemagick', fps=60)

    env = gym.make('GrowSpaceEnv-Control-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")

    # del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    frames = []
    obs = env.reset()
    for _ in range(150):
        # while True:
        frames.append(env.render(mode="rgb_array"))

        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        # if done:
        #     break
        # env.render()

    env.close()
    save_frames_as_gif(frames)
Example #14
def render_growspace_with_trpo():
    env = gym.make('GrowSpaceEnv-Control-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    #
    # del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    for t in range(150):
        print(t)
        # while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)

        # if dones:
        #     env.reset()
        env.render()
Example #15
def trpo(env_id,
         timesteps,
         policy="MlpPolicy",
         log_interval=None,
         tensorboard_log=None,
         seed=None,
         load_weights=None):
    from stable_baselines import TRPO
    env = gym.make(env_id)

    if load_weights is not None:
        model = TRPO.load(load_weights, env=env, verbose=0)
    else:
        model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="trpo", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
Example #16
def visual_test(model_path: str):

    test_env, _ = init_env()

    model = TRPO.load(model_path, env=test_env)
    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)

    raw_env = monitor.unwrapped  # type: CartPoleEnv
    assert isinstance(raw_env, CartPoleEnv)

    for _ in range(5):
        obs = test_env.reset()
        for _ in range(500):
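            # render and step the policy for up to 500 frames, pacing the
            # loop at roughly 60 FPS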
            test_env.render()
            action, states = model.predict(obs)
            obs, rewards, dones, info = test_env.step(action)
            sleep(1./60)

    test_env.close()
Example #17
def train(game, num_timesteps, num_envs, dir_name, model_name,
          prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    
    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    
    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
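    # resume training from the previous checkpoint if it exists,
    # otherwise start a fresh TRPO model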
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = TRPO.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = TRPO(policy="MlpPolicy", env=env, gamma=0.8, verbose=1,
                     tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()
Example #18
def tst():
    def _init_openmpi():
        """Pre-load libmpi.dll and register OpenMPI distribution."""
        import os
        import ctypes
        if os.name != 'nt' or 'OPENMPI_HOME' in os.environ:
            return
        try:
            openmpi_home = os.path.abspath(os.path.dirname(__file__))
            openmpi_bin = os.path.join(openmpi_home, 'bin')
            os.environ['OPENMPI_HOME'] = openmpi_home
            os.environ['PATH'] = ';'.join((openmpi_bin, os.environ['PATH']))
            ctypes.cdll.LoadLibrary(os.path.join(openmpi_bin, 'libmpi.dll'))
        except Exception:
            pass

    _init_openmpi()

    import gym

    from stable_baselines.common.policies import MlpPolicy, CnnPolicy
    from stable_baselines import TRPO

    env = gym.make('BreakoutNoFrameskip-v4')  #'CartPole-v1')

    model = TRPO(CnnPolicy, env, timesteps_per_batch=1024, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("trpo_cartpole")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #19
def mainBal(arg):
    test = arg == TEST
    
    env = fet.FurutaEnvPosTrpoBal(cm.RUN, render = not test) 
    #env.setRender(not test)
    model = TRPO.load(POLICY_PATH + "trpo_pos_policy_bal.pkl")
    
    buf_rew = []
    test_cutoff_count = 0
    complete_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_BAL:
            print("\n***Average reward: %.3f\tLong runs: %d\tAverage count: %.3f\tCompleted: %d\tOverspeed: %d***\n" % (sum(buf_rew)/float(len(buf_rew)), test_cutoff_count, total_count/float(test_count), complete_count, overspeed))
            break
        
        obs, done = env.reset(), False
        #obs[4] = ARM_TARGET_RAD
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            #obs[4] = ARM_TARGET_RAD
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MIN:
            test_cutoff_count += 1
        print("Episode reward: %.3f\tCount: %d" % (episode_rew, count))
Example #20
def visual_test(model_path: str, exp_config: dict):

    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # model = HER.load(model_path, env=test_env)
        raise NotImplementedError()
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)

    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)

    for _ in range(5):
        obs = test_env.reset()
        mass_distr_params = raw_env.mass_distr_params
        sampled_mass = raw_env.physical_props[1]
        print(
            f'==> distribution params: {mass_distr_params} (mean, stdev) | sampled mass: {sampled_mass}'
        )
        for _ in range(200):
            test_env.render()
            action, states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = test_env.step(action)
            sleep(1. / 60)

    test_env.close()
Example #21
environment = 'Swimmer-v2'
path = (f'Results/{environment}_seed={seed}_run={run}'
        f'_total_timesteps={total_timesteps}_trpo_episode_reward.npy')
pathmodel = (f'Results/{environment}_seed={seed}_run={run}'
             f'_total_timesteps={total_timesteps}_trpo')

env = gym.make(environment)
env = DummyVecEnv([lambda: env])

# Automatically normalize the input features
env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

model = TRPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=total_timesteps, path=path, seed=seed)
model.save(pathmodel)

# Don't forget to save the running average when saving the agent
log_dir = "/tmp/"
env.save_running_average(log_dir)
# del model  # remove to demonstrate saving and loading
model = TRPO.load(pathmodel)

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #22
                    env,
                    n_steps=64,
                    verbose=1,
                    tensorboard_log=out_dir)
    elif args.model == 'sac':
        model = SAC("CnnPolicy", env)
    train(model, env, out_dir)
else:
    #results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "rl")
    path = '{}/best_model.zip'.format(args.eval)
    env = CarEnv(args.eval, cam_idx_list=(0, 3, 4))
    env.next_weather()
    #env = Monitor(env, args.eval)
    #print(env.num_envs)
    if args.model == 'trpo':
        model = TRPO.load(path)
    elif args.model == 'acer':
        model = ACER.load(path)
    elif args.model == 'ppo':
        model = PPO2.load(path)
    elif args.model == 'acktr':
        model = ACKTR.load(path)
    elif args.model == 'ddpg':
        model = DDPG.load(path)
    elif args.model == 'a2c':
        model = A2C.load(path)
    elif args.model == 'sac':
        model = SAC.load(path)
    #mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5,return_episode_rewards=True)
    #eps_rewards, eps_len = evaluate_policy(model, env, n_eval_episodes=5,return_episode_rewards=True)
    # print(eps_rewards)
Example #23
    model.set_env(env)

    model.learn(total_timesteps=int(args.total_timesteps))
    # library helper
    plot_results(
        [log_dir],
        int(args.total_timesteps),
        results_plotter.X_TIMESTEPS,
        "TRPO muscle" + identifer,
    )
    plt.savefig("convergence_plot" + identifer + ".png")
    model.save("policy-" + identifer)

else:
    # Use trained policy for the simulation.
    model = TRPO.load("trpo_" + identifer)
    obs = env.reset()

    done = False
    score = 0
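    # roll out the trained policy until the episode ends or the simulated
    # time exceeds final_time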
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        score += rewards
        if info["ctime"] > final_time:
            break
    print("Final Score:", score)
    env.post_processing(
        filename_video="video-" + identifer + ".mp4",
        SAVE_DATA=True,
    )
Example #24
from stable_baselines import TRPO
from stable_baselines import PPO2
from snake_env.gym_swimmer_env import SwimmerLocomotionEnv
import numpy as np

fixed_path = [(-0.2 * i, 0) for i in range(30)]

use_random_path = False
robot_k = 1.0
robot_link_length = 0.3

#these are for testing
#model = TRPO.load("trpo_swimmer")
model = TRPO.load("real_trpo_swimmer_traj_following")
env = SwimmerLocomotionEnv(path=fixed_path,
                           random_path=use_random_path,
                           use_hard_path=False,
                           robot_link_length=robot_link_length,
                           robot_k=robot_k,
                           record_trajectory=True)

obs = env.reset()
total_reward = 0
x_list = []
for i in range(10000):
    action, _states = model.predict(obs)
    #step_time = 0.5
    #action = [-0.8*np.sin(step_time*i), 0.8*np.cos(step_time*i)]
    # print("start of step")
    print(action)
    x_list.append(action[1])
Example #25
elif AGENT_ALGORITHM == "PPO2":
    # Create model
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=global_path + "tb")

    # Load if pretrained
    if PRETRAINED_MODEL:
        model = PPO2.load(global_path + pretrained_model_name, env=env)
        print("INFO: Loaded model " + global_path + pretrained_model_name)

elif AGENT_ALGORITHM == "TRPO":
    # Create model
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=global_path + "tb")

    # Load if pretrained
    if PRETRAINED_MODEL:
        model = TRPO.load(global_path + pretrained_model_name, env=env)
        print("INFO: Loaded model " + global_path + pretrained_model_name)
else:
    raise RuntimeError('ERROR: Agent not recognized')


def evaluate(model, num_steps=1000, pub=None):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward for the last 100 episodes
    """

    episode_rewards = [0.0]
    obs = env.reset()
Example #26
    def __init__(self,
                 obs_shape,
                 action_space,
                 base=None,
                 base_kwargs=None,
                 load_expert=None,
                 env_name=None,
                 rl_baseline_zoo_dir=None,
                 expert_algo=None,
                 normalize=True):
        super(Policy, self).__init__()

        #TODO: Pass these parameters in
        self.epsilon = 0.1
        self.dril = True

        if base_kwargs is None:
            base_kwargs = {}
        if base is None:
            if env_name in ['duckietown']:
                base = DuckieTownCNN
            elif len(obs_shape) == 3:
                base = CNNBase
            elif len(obs_shape) == 1:
                base = MLPBase
            else:
                raise NotImplementedError

        self.base = base(obs_shape[0], normalize=normalize, **base_kwargs)
        self.action_space = None
        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(self.base.output_size, num_outputs)
            self.action_space = "Discrete"
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(self.base.output_size, num_outputs)
            self.action_space = "Box"
        elif action_space.__class__.__name__ == "MultiBinary":
            raise Exception('Error')
        else:
            raise NotImplementedError

        if load_expert == True and env_name not in [
                'duckietown', 'highway-v0'
        ]:
            print('[Loading Expert --- Base]')
            model_path = os.path.join(rl_baseline_zoo_dir, 'trained_agents',
                                      f'{expert_algo}')
            try:
                import mpi4py
                from stable_baselines import TRPO
            except ImportError:
                mpi4py = None
                DDPG, TRPO = None, None

            from stable_baselines import PPO2

            model_path = f'{model_path}/{env_name}.pkl'
            if env_name in ['AntBulletEnv-v0']:
                baselines_model = TRPO.load(model_path)
            else:
                baselines_model = PPO2.load(model_path)
            for key, value in baselines_model.get_parameters().items():
                print(key, value.shape)

            if base.__name__ == 'CNNBase':
                print(['Loading CNNBase expert model'])
                params = copy_cnn_weights(baselines_model)
            elif load_expert == True and base.__name__ == 'MLPBase':
                print(['Loading MLPBase expert model'])
                params = copy_mlp_weights(baselines_model)

            #TODO: I am not sure what this is doing
            try:
                self.load_state_dict(params)
                self.obs_shape = obs_shape[0]
            except:
                self.base = base(obs_shape[0] + 1, **base_kwargs)
                self.load_state_dict(params)
                self.obs_shape = obs_shape[0] + 1
Example #27
def main():

    parser = argparse.ArgumentParser(
        description='Plotting mechanisms for GARAT and related modifications')
    parser.add_argument('--sim_env',
                        default="InvertedPendulum-v2",
                        type=str,
                        help="Name of the simulator/source environment")
    parser.add_argument('--real_env',
                        default="InvertedPendulumModified-v2",
                        type=str,
                        help="Name of the real/target environment")
    parser.add_argument(
        '--load_policy_path',
        default=
        "data/models/TRPO_initial_policy_steps_InvertedPendulum-v2_2000000_.pkl",
        help="relative path of policy to be used for generating plots")
    parser.add_argument(
        '--load_atp_path',
        default=
        "data/models/garat/Single_GAIL_sim2real_TRPO_2000000_1000_50_0/",
        type=str,
        help="relative path for stored Action transformation policies")
    parser.add_argument('--seed', default=0, type=int, help="Random seed")
    args = parser.parse_args()

    #Set seed
    np.random.seed(args.seed)

    sim_env = gym.make(args.sim_env)
    real_env = gym.make(args.real_env)

    policy = TRPO.load(args.load_policy_path)

    action_tf_policy_list_single = []
    action_tf_policy_list_double = []
    action_tf_policy_list_shared_double = []
    action_tf_policy_list_airl = []
    num_grounding = 50

    atp_path_single = args.load_atp_path
    atp_path_double = args.load_atp_path.replace('_0', '_2')
    atp_path_shared_double = args.load_atp_path.replace('_0', '_1')
    atp_path_airl = args.load_atp_path.replace(
        'Single_GAIL_sim2real_TRPO_2000000_1000_50_0',
        'Single_AIRL_sim2real_TRPO_2000000_1000_50_1')

    print('################## Begin File loading ##################')
    for index in range(num_grounding):
        file_path_single = os.path.join(
            atp_path_single,
            "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_single)
        action_tf_policy_list_single.append(PPO2.load(file_path_single))
        file_path_double = os.path.join(
            atp_path_double,
            "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_double)
        action_tf_policy_list_double.append(PPO2.load(file_path_double))
        file_path_shared_double = os.path.join(
            atp_path_shared_double,
            "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_shared_double)
        action_tf_policy_list_shared_double.append(
            PPO2.load(file_path_shared_double))
        #file_path_airl = os.path.join(atp_path_airl,"action_transformer_policy1_"+str(index)+".pkl")
        #print(file_path_airl)
        #action_tf_policy_list_airl.append(PPO2.load(file_path_airl))
    results_dict = {}
    print('################## File loading Completed ##################')

    results_single = calculate_transition_errors(sim_env, real_env, policy,
                                                 action_tf_policy_list_single)

    print('############## Begin Double Discriminator Calculations')

    results_shared_double = calculate_transition_errors(
        sim_env, real_env, policy, action_tf_policy_list_shared_double)

    results_double = calculate_transition_errors(sim_env, real_env, policy,
                                                 action_tf_policy_list_double)

    print('############## Begin AIRL Calculations')

    #results_airl = calculate_transition_errors(sim_env, real_env, policy, action_tf_policy_list_airl)

    results_dict['GARAT'] = results_single
    results_dict['GARAT Double Discriminator'] = results_double
    results_dict[
        'GARAT Double Discriminator (Generator LR modifications)'] = results_shared_double
    #results_dict['GARAT AIRL'] = results_airl

    plot_results(results_dict)
Example #28
def _evaluation_worker(test_data,
                       model_type,
                       model_path,
                       perfect_knowledge,
                       episode_length=200,
                       mpc_sequences=2000,
                       model_kwargs=None):

    # init environment
    env = gym.make('GaussianPendulum-v0')
    model_kwargs = model_kwargs or dict()
    vision = False

    if model_type.startswith('mpc'):

        # load model
        if model_type == 'mpc-mdn':
            model = MDN_Model.load(model_path, **model_kwargs)
        elif model_type == 'mpc-mlp':
            model = MlpModel.load(model_path, **model_kwargs)
        elif model_type == 'mpc-sim':
            model = PendulumSim(env, **model_kwargs)
        elif model_type == 'mpc-vae-mlp':
            model = VaeTorchModel(model_path, **model_kwargs)
            vision = True
        else:
            raise NotImplementedError

        mpc = MPC(env,
                  model,
                  horizon=20,
                  n_action_sequences=mpc_sequences,
                  np_random=None)

        def next_action(obs):
            return mpc.get_action(obs)

        model_info = dict(type=model_type,
                          horizon=mpc.horizon,
                          sequences=mpc.n_action_sequences,
                          perfect_knowledge=perfect_knowledge)

    elif model_type == 'trpo':

        # load model
        model = TRPO.load(model_path, env=env, **model_kwargs)

        def next_action(obs):
            action, _ = model.predict(obs, deterministic=True)
            return action

        model_info = dict(type='trpo', perfect_knowledge=perfect_knowledge)

    else:
        raise NotImplementedError

    rewards = _run_model(env,
                         next_action,
                         test_data,
                         episode_length=episode_length,
                         embed_knowledge=perfect_knowledge,
                         perfect_knowledge=perfect_knowledge,
                         vision=vision)

    results = pd.DataFrame(test_data)
    results = results.assign(rewards=pd.Series(rewards).values)
    results = results.assign(model_info=[model_info] * len(results))

    return results
Example #29
def main():
    # parameters for the gym_carla environment
    params = {
        'number_of_vehicles': 8,
        'number_of_walkers': 0,
        'display_size': 256,  # screen size of bird-eye render
        'max_past_step': 1,  # the number of past steps to draw
        'dt': 0.1,  # time interval between two frames
        'discrete': True,  # whether to use discrete control space
        'continuous_accel_range': [-3.0, 3.0],  # continuous acceleration range
        'ego_vehicle_filter':
        'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'town': 'Town06',  # which town to simulate
        'task_mode':
        'acc_1',  # mode of the task, [random, roundabout (only for Town03)]
        'max_time_episode': 1000,  # maximum timesteps per episode
        'max_waypt': 12,  # maximum number of waypoints
        'obs_range': 32,  # observation range (meter)
        'lidar_bin': 0.125,  # bin size of lidar sensor (meter)
        'd_behind': 12,  # distance behind the ego vehicle (meter)
        'out_lane_thres': 2.0,  # threshold for out of lane
        'desired_speed': 16.67,  # desired speed (m/s)
        'max_ego_spawn_times': 200,  # maximum times to spawn ego vehicle
        'display_route': True,  # whether to render the desired route
        'pixor_size': 64,  # size of the pixor labels
        'pixor': False,  # whether to output PIXOR observation
        'RGB_cam': True,  # whether to use RGB camera sensor
    }
    solver_params = {
        'layers': [64, 64, 64],
        'alpha': 0.001,
        'gamma': 0.99,
        'epsilon': 0.1,
        'replay_memory_size': 500000,
        'update_target_estimator_every': 10000,
        'batch_size': 64,
    }
    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)
    # check_env(env)
    obs = env.reset()
    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./trpo_checkpoint/',
                                             name_prefix='trpo_check')

    #model = DQN.load("./trpo_checkpoint/trpo_check_200_steps.zip",env=env,tensorboard_log="./trpo)
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log="./trpo")
    model.learn(total_timesteps=35000,
                tb_log_name="35k-with-checkoint",
                callback=checkpoint_callback)
    model.save("trpo_carla")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_carla")

    obs = env.reset()
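    # run 100 evaluation episodes, resetting the environment whenever an
    # episode terminates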
    for i in range(100):
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                obs = env.reset()
                break
Example #30
def evaluate(game, num_eps, num_envs, dir_name, model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    
    log_dir = f"logs/{dir_name}/{model_name}"
    os.makedirs(log_dir, exist_ok=True)
    
    env = make_vec_envs(game, True, num_envs, model_name=model_name)
    model_path = f"models/{dir_name}/{model_name}.zip"
    model = TRPO.load(model_path, env=env)
    model.tensorboard_log = log_dir
    
    eps_done = 0
    ep_rewards = np.array([0] * num_eps)
    curr_rewards = [0] * num_envs
    obs = env.reset()
    while eps_done != num_eps:
        # For vectorised environments, they are automatically reset when done,
        # so returned obs would be the start state of next episode
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        env.render(mode="human")
        
        for i in range(num_envs):
            curr_rewards[i] += reward[i]
            if done[i]:
                ep_rewards[eps_done] = curr_rewards[i]
                curr_rewards[i] = 0
                eps_done += 1
    print("All episodes completed")
    env.close()
    
    mean = ep_rewards.mean()
    std_dev = ep_rewards.std()
    # Outliers: outside of 3 standard deviations
    outlier_threshold_upper = mean + 3 * std_dev
    outlier_threshold_lower = mean - 3 * std_dev
    trimmed_rewards = np.array([
        rew for rew in ep_rewards
        if outlier_threshold_lower <= rew <= outlier_threshold_upper
    ])
    avg_reward = trimmed_rewards.mean()
    best_reward = ep_rewards.max()
    print(f"Average score over {num_eps} games: {avg_reward:.2f}")
    print(f"Best score: {best_reward}")
    
    summary_writer = tf.summary.FileWriter(log_dir)
    sess = tf.Session()
    rew_var = tf.Variable(0, dtype=tf.int64)
    rew_val = tf.summary.scalar(f"Reward / Episode ({model_name})", rew_var)
    for i in range(num_eps):
        rew = ep_rewards[i]
        sess.run(rew_var.assign(rew))
        summary_writer.add_summary(sess.run(rew_val), i)
    
    best_val = tf.summary.scalar(f"Best Reward", rew_var)
    sess.run(rew_var.assign(best_reward))
    summary_writer.add_summary(sess.run(best_val), 0)
    
    avg_var = tf.Variable(0.0, dtype=tf.float64)
    avg_val = tf.summary.scalar(f"Trimmed Average ({model_name})", avg_var)
    sess.run(avg_var.assign(avg_reward))
    summary_writer.add_summary(sess.run(avg_val), 0)
    
    summary_writer.flush()
    summary_writer.close()
    sess.close()