Example 1
def run_scratch(env_name):
    import os.path as osp
    # acrobot.torque_noise_max = 0.05
    env = RLPyEnv(env_name())
    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(8, 8))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.995,
        step_size=0.01,
        # plot=True,
    )
    # algo.train()
    # rollout(env, policy)
    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        # script="scripts/run_experiment_lite.py",
        log_dir="Final_Results/Gradient/Scratch",
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        # plot=True,
    )
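These examples all rely on the same module-level preamble, which is not repeated with each snippet. Below is a minimal sketch of it, assuming the standard rllab package layout; the project-specific pieces (RLPyEnv, HRLEnv, PolicyLoader, the RCCar* domains, and so on) live in this repository, so their import paths are left as placeholders rather than guessed.

# Sketch of the shared preamble assumed by the examples in this section.
import os
import datetime

# Standard rllab components used throughout:
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy
from rllab.policies.categorical_gru_policy import CategoricalGRUPolicy
from rllab.misc.instrument import run_experiment_lite

# Project-specific pieces (module paths unknown, left as placeholders):
# from <project>.envs import RLPyEnv, HRLEnv, NoisyObservationEnv, DroppedObservationEnv
# from <project>.policies import PolicyLoader, load_rllab_policy
# from <project>.domains import RCCarSlideTurn, RCCarBarriers, RCCarLeft, ModifiedAcrobot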
Example 2
def slideturn_from_scratch(num=1, directory="./Final_Results/Car/SlideTurn/", exp_name="Base_tmp", save=False):
    rccar = RCCarSlideTurn(noise=0.1)
    env = RLPyEnv(rccar)
    dir_name = os.path.join(directory, exp_name)
    for i in range(num):
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1000,
            max_path_length=env.horizon,
            n_itr=500,
            discount=0.995,
            step_size=0.001,
            # plot=True,
        )
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            script="scripts/run_experiment_lite_rl.py",
            exp_name=exp_name + str(i),
            log_dir=os.path.join(dir_name, timestamp) if save else './Results/Tmp',
            # plot=True,
        )
Example 3
def test(num=1, path="./Results/Tmp", save=False):
    policies = [
        PolicyLoader("models/Acrobot/" + model_name)
        for model_name in ['Mass2_Light', 'Mass1_Light', 'Mass1_Heavy']
    ]
    # directory = os.path.join(directory, exp_name)
    acrobot = ModifiedAcrobot()
    acrobot.dt = 0.1
    acrobot.episodeCap = 1000
    # acrobot.torque_noise_max = 0.05
    domain = RLPyEnv(acrobot)
    env = HRLEnv(domain, policies)
    # env = DoublePendulumEnv()
    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(8, 8))
    # rollout(env, policy)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.995,
        step_size=0.001,
        # plot=True,
    )
    algo.train()
Example 4
def generate_slide_model(num=1, directory="./Results/Car/Slide/", exp_name="Base", save=False):
    rccar = RCCarLeft(noise=0.)
    env = RLPyEnv(rccar)
    now = datetime.datetime.now()
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=8000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.9,
        step_size=0.01,
        # plot=True,
    )
    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        exp_name=exp_name + timestamp,
        log_dir=os.path.join(directory, exp_name) if save else './Results/Tmp',
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=1,
        # plot=True,
    )
Example 5
def good_inverted_car(directory="./Results/Car/Noisy_500/"):
    rccar = RCCarBarriers(noise=0.)
    policies = [
        PolicyLoader("models/noisy/" + path) for path in ['good', 'other']
    ]
    domain = RLPyEnv(rccar)
    env = HRLEnv(domain, policies)
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=500,
        discount=0.9,
        step_size=0.001,
        # plot=True,
    )
    assert False, "Make sure to change logging directory before rerunning this experiment"

    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        log_dir=directory,
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=1,
        # plot=True,
    )
Example 6
def noisy_bandits():
    rccar = RCCarBarriers(noise=0.1)
    policies = [
        PolicyLoader("models/noisy/" + path)
        for path in ['good', 'untrained', 'untrained', 'untrained']
    ]
    domain = RLPyEnv(rccar)
    band = Bandits(policies, domain, N=100, rmax=rccar.GOAL_REWARD,
                   rmin=-rccar.episodeCap - 20)
    chosen = band.run()
    import joblib
    joblib.dump(band.choices, "Results/Bandits/Noisy")
    return band
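The choice history dumped above can be reloaded later for offline analysis; assuming band.choices is a plain sequence of per-round arm indices (an assumption about the Bandits class), the round trip is just joblib.load:

# Reload the arm choices saved by noisy_bandits(); assumes band.choices is a
# sequence of per-round arm indices (an assumption about Bandits).
import joblib
choices = joblib.load("Results/Bandits/Noisy")
print len(choices), "bandit decisions recorded"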
Example 7
def two_policy_bandits():
    rccar = RCCarSlideTurn(noise=0.1)  # process noise left on here (noise=0. removes it)
    domain = RLPyEnv(rccar)
    policies = [
        PolicyLoader("models/slideturn_experiment/" + path)
        for path in ['agent0', 'agent1']
    ]
    band = Bandits(policies, domain, N=100, rmax=rccar.GOAL_REWARD,
                   rmin=-rccar.episodeCap)
    chosen = band.run()
    import joblib
    joblib.dump(band.choices, "Results/Bandits/TwoPolicy")
    return band
Example 8
def good_double_bad_car(num=1,
                        directory="./Results/Car/CarNoisy/",
                        exp_name="Noisy_2_untrained",
                        save=False):
    rccar = RCCarBarriers(noise=0.1)
    policies = [
        PolicyLoader("models/noisy/" + path)
        for path in ['good', 'untrained', 'untrained']
    ]
    domain = RLPyEnv(rccar)
    env = HRLEnv(domain, policies)
    # policy = CategoricalMLPPolicy(
    #     env_spec=env.spec,
    # )
    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    # algo = TRPO(
    #     env=env,
    #     policy=policy,
    #     baseline=baseline,
    #     batch_size=4000,
    #     max_path_length=env.horizon,
    #     n_itr=500,
    #     discount=0.9,
    #     step_size=0.001,
    #     # plot=True,
    # )
    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    for i in range(num):
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=env.horizon,
            n_itr=600,
            discount=0.9,
            step_size=0.0001,
            # plot=True,
        )

        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            script="scripts/run_experiment_lite_rl.py",
            exp_name=exp_name + timestamp,
            log_dir=os.path.join(directory, exp_name +
                                 timestamp) if save else "./Results/Tmp",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            # plot=True,
        )
Example 9
def good_x_cars(num_agents=5,
                directory="./Final_Results/Car/CarNoisyAgents/",
                exp_name="NoisyTest",
                save=False):

    rccar = RCCarBarriers(noise=0.1)
    policies = [
        PolicyLoader("models/noisy/" + path) for path in [
            'good', 'untrained', 'untrained', 'untrained', 'untrained',
            'untrained'
        ]
    ][:1 + num_agents]
    domain = RLPyEnv(rccar)
    env = HRLEnv(domain, policies)
    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    exp_name = exp_name + str(num_agents)
    directory = os.path.join(directory, exp_name)
    for i in range(3):
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2000,
            max_path_length=env.horizon,
            n_itr=500,
            discount=.995,
            step_size=0.001,
            # plot=True,
        )
        # algo.train()
        # rollout(env, policy)
        try:
            os.mkdir(directory)
        except OSError:
            # directory already exists
            pass
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            script="scripts/run_experiment_lite_rl.py",
            exp_name=exp_name,
            log_dir=os.path.join(directory, timestamp)
            if save else "./Results/Tmp",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            # plot=True,
        )
Example 10
def slideturn_noisy(val=0.1,
                    directory="./Results/Car/NoisyObs500/",
                    exp_name="Cap_",
                    save=False):
    policies = [
        PolicyLoader("models/slideturn_experiment/" + path)
        for path in ['agent0', 'agent1']
    ]
    rccar = RCCarSlideTurn(noise=0.)  # remove process noise

    domain = RLPyEnv(rccar)
    original_env = HRLEnv(domain, policies)
    env = NoisyObservationEnv(original_env, obs_noise=val)
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    dir_name = os.path.join(directory, exp_name)
    for i in range(1):
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=env.horizon,
            n_itr=500,
            discount=0.9,
            step_size=0.01,
            # plot=True,
        )
        # algo.train()
        # rollout(env, policy)
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            script="scripts/run_experiment_lite_rl.py",
            exp_name=exp_name + timestamp,
            log_dir=os.path.join(dir_name, timestamp)
            if save else './Results/Tmp2',
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            # plot=True,
        )
Example 11
def scratch_slideturn_dropped_rec(
        val=0.1,
        directory="./Results/Car/Scratch/DroppedObsRec/",
        exp_name="Cap_",
        save=False):
    rccar = RCCarSlideTurn(noise=0.)  # remove process noise
    original_env = RLPyEnv(rccar)
    env = DroppedObservationEnv(original_env, drop_prob=val)
    policy = CategoricalGRUPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    for i in range(5):
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=env.horizon,
            n_itr=500,
            discount=0.995,
            step_size=0.01,
            # plot=True,
        )
        # algo.train()
        # rollout(env, policy)
        dir_name = os.path.join(directory, exp_name)
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            script="scripts/run_experiment_lite_rl.py",
            exp_name=exp_name + timestamp,
            log_dir=os.path.join(dir_name, timestamp)
            if save else './Results/Tmp2',
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            # plot=True,
        )
Example 12
def slideturn_turn_only(num=1, directory="./Final_Results/Car/SlideTurn/", exp_name="Turn", save=True):
    policies = [
        PolicyLoader("models/slideturn_experiment/" + path)
        for path in ['agent1']
    ]
    directory = os.path.join(directory, exp_name)
    for i in range(num):
        rccar = RCCarSlideTurn(noise=0.1)
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

        domain = RLPyEnv(rccar)
        env = HRLEnv(domain, policies)
        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1000,
            max_path_length=env.horizon,
            n_itr=500 + 3,
            discount=0.995,
            step_size=0.001,
            # plot=True,
        )
        # algo.train()
        # rollout(env, policy)
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            script="scripts/run_experiment_lite_rl.py",
            exp_name=exp_name,
            log_dir=os.path.join(directory, timestamp) if save else './Results/Tmp',
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            # plot=True,
        )
Example 13
def check_base(env_name):
    import os.path as osp
    for path in ['RCCarRightTurnGradient', 'RCCarSlideLeftGradient']:
        policies = [
            load_rllab_policy(
                osp.join("models/rc_gradient/" + path, "params.pkl"))
        ]
        # directory = os.path.join(directory, exp_name)
        # acrobot.torque_noise_max = 0.05
        domain = RLPyEnv(env_name())
        env = HRLEnv(domain, policies)
        # env = DoublePendulumEnv()
        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(8, 8))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=env.horizon,
            n_itr=100,
            discount=0.995,
            step_size=0.01,
            # plot=True,
        )
        # algo.train()
        # rollout(env, policy)
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            script="scripts/run_experiment_lite_rl.py",
            # script="scripts/run_experiment_lite.py",
            log_dir="Final_Results/Gradient/" + path,
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            # plot=True,
        )
Example 14
def switch_from_turn(exp_name="Switch_Turn", num=1, directory="./Results/Car/SlideTurn/", save=True):
    """Environment begins with Turn bias, switches to mixed after 1e5 calls to the step function"""
    rccar = RCCarSlideTurn(noise=0.1)
    env = RLPyEnv(rccar)
    now = datetime.datetime.now()
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
    import joblib
    data = joblib.load("Results/Car/Turn/Base/params.pkl") # LOAD POLICY
    policy = data['policy']
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.9,
        step_size=0.01,
        # plot=True,
    )
    # algo.train()
    # rollout(env, policy)

    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        exp_name=exp_name + timestamp,
        log_dir=os.path.join(directory, exp_name) if save else './Results/Tmp',
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=1,
        # plot=True,
    )
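The docstring above says the switch to the mixed regime happens inside the environment after 1e5 calls to step(); that logic is presumably baked into RCCarSlideTurn itself. Purely as an illustration, a wrapper that flips a mode after a fixed step count could look like the sketch below; set_mode and the 'mixed' flag are hypothetical hooks, not the project's API.

class ModeSwitchWrapper(object):
    """Illustrative sketch: delegate to a wrapped env and flip its mode
    after `switch_after` calls to step(). Assumes the wrapped env exposes
    a set_mode() hook, which is an assumption, not the project's API."""

    def __init__(self, env, switch_after=int(1e5)):
        self._env = env
        self._switch_after = switch_after
        self._steps = 0

    def __getattr__(self, name):
        # Fall through to the wrapped env for everything else (spec, horizon, ...).
        return getattr(self._env, name)

    def step(self, action):
        self._steps += 1
        if self._steps == self._switch_after:
            self._env.set_mode('mixed')  # hypothetical hook
        return self._env.step(action)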
Example 15
def train_domain(domain):
    rc = domain()
    env = RLPyEnv(rc)
    # env = ControllerEnv(k=10)
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16, 16),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.995,
        step_size=0.01,
        plot=False,
    )
    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        # script="scripts/run_experiment_lite.py",
        log_dir="models/rc_gradient/" + domain.proxy_class.__name__,
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        # plot=True,
    )
Example 16
    return sum(successes) * 1. / 100


if __name__ == '__main__':
    import os.path as osp

    train_domain(RCCarRightTurnGradient)
    train_domain(RCCarSlideLeftGradient)
    create_meta(RCCarTurnSlideGradient)
    check_base(RCCarTurnSlideGradient)
    run_scratch(RCCarTurnSlideGradient)

    policies = [
        load_rllab_policy(osp.join("models/rc_gradient/" + path, "params.pkl"))
        for path in [
            'RCCarRightTurnGradient',
            'RCCarSlideLeftGradient',
        ]
    ]
    domain = RLPyEnv(RCCarTurnSlideGradient())
    env = HRLEnv(domain, policies)
    main_policies = [
        load_rllab_policy(
            osp.join("Final_Results/Gradient/" + path, "params.pkl"))
        for path in ['Joined', 'Scratch']
    ]
    print evaluate_prob_success(env, main_policies[0])
    print evaluate_prob_success(domain, main_policies[1])

    # for pi in policies:
    #     print evaluate_prob_success(domain, pi)
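Only the final line of evaluate_prob_success is visible (the orphaned `return sum(successes) * 1. / 100` that opens this example). A plausible reconstruction, rolling the policy out 100 times and recording one success flag per episode, is sketched below; the success criterion (positive final reward, i.e. the goal bonus was collected) is an assumption, not the original code.

def evaluate_prob_success(env, policy):
    # Hypothetical reconstruction: only the final `return` line is visible
    # in the original source. Runs 100 rollouts and records a 1 for each
    # episode judged successful; the success test (positive final reward)
    # is an assumption.
    successes = []
    for _ in xrange(100):
        observation = env.reset()
        reward = 0
        for _ in xrange(env.horizon):
            action, _ = policy.get_action(observation)
            observation, reward, terminal, _ = env.step(action)
            if terminal:
                break
        successes.append(1 if reward > 0 else 0)
    return sum(successes) * 1. / 100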
Example 17
def rollout(env, policy, N=100, force_act=None):
    # if env.__class__.__name__ == "StandardControllerEnv" or env.__class__.__name__ == "ControllerEnv":
    #     import numpy as np
    #     xmax, ymax = env.observation_space.high
    #     xmin, ymin = env.observation_space.low
    #     X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    #     positions = np.vstack([X.ravel(), Y.ravel()])
    #     Z = np.reshape(env._regions.predict(positions.T), X.shape)
    #     plt.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
    #               extent=[xmin, xmax, ymin, ymax])
    # visualize_reward(env)
    # sums = visualize_distribution(env, policy)
    T = env.horizon

    xmin, xmax = -3.5, 3.5
    ymin, ymax = -2, 2
    env = RLPyEnv(RCCarSlideGradient())
    print "!!!!!!!!!!!!!!!!!!!!!!!!!!! Make sure you have right environment"
    print "!!!!!!!!!!!!!!!!!!!!!!!!!!! Make sure you have right environment"
    print "!!!!!!!!!!!!!!!!!!!!!!!!!!! Make sure you have right environment"
    import ipdb
    ipdb.set_trace()  # breakpoint cc480b65 //

    def show_interactive_traj(observations, num=0, color='r', style='x'):
        traj_x, traj_y = zip(*observations)
        try:
            line = plt.axes().lines[num]
            if len(line.get_xdata()) != len(
                    traj_x):  # if plot has discrepancy from data
                line.set_xdata(traj_x)
                line.set_ydata(traj_y)
        except IndexError:
            plt.plot(traj_x, traj_y, style, color=color)

    def show_trajectory(observations, actions, style='o'):
        try:
            observations_0 = [
                xa[0] for xa in zip(observations, actions) if xa[1] == 0
            ]
            traj_x, traj_y = zip(*observations_0)
            plt.plot(traj_x, traj_y, style, color='r', alpha=0.05)
        except Exception:
            pass
        try:
            observations_1 = [
                xa[0] for xa in zip(observations, actions) if xa[1] == 1
            ]
            traj_x, traj_y = zip(*observations_1)
            plt.plot(traj_x, traj_y, style, color='b', alpha=0.05)
            plt.show()
        except Exception:
            pass

    # plt.imshow(np.rot90(sums), cmap=plt.cm.PuOr, extent=[xmin, xmax, ymin, ymax])
    for _ in xrange(N):
        full_observations = []
        observations = []
        actions = []
        rewards = []

        observation = env.reset()
        T = env.horizon
        # env.render()

        for _ in xrange(T):
            # policy.get_action() returns a pair of values. The second one returns a dictionary, whose values contains
            # sufficient statistics for the action distribution. It should at least contain entries that would be
            # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
            # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
            # not needed.
            env.render()
            action, _ = policy.get_action(observation)
            if force_act is not None:
                action = force_act
            full_observations.append(observation)
            observations.append(observation[:2])
            actions.append(action)
            # action = policy.action_space.sample()
            # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
            # case it is not needed.
            next_observation, reward, terminal, _ = env.step(action)
            rewards.append(reward)
            observation = next_observation
            # observation_0 = [xa[0] for xa in zip(observations, actions) if xa[1] == 0]
            # observation_1 = [xa[0] for xa in zip(observations, actions) if xa[1] == 1]
            # if len(observation_0):
            #     show_interactive_traj(observation_0)
            # if len(observation_1):
            #     show_interactive_traj(observation_1, num=1, color='b', style='o')

            # totalobvs.append(observation)
            if terminal:
                observations.append(observation[:2])
                # Finish rollout if terminal state reached
                break
        print sum(rewards)
        # show_trajectory(observations, actions)

        # x_list, y_list = zip(*observations)
    print observations
Example 18
            rewards.append(reward)
            observation = next_observation
            # totalobvs.append(observation)
            if terminal:
                # Finish rollout if terminal state reached
                break
        x_list, y_list = zip(*observations)
        print observations
        print actions
        plt.plot(x_list, y_list)
        plt.show()


# rl = ModifiedAcrobot()
rc = RCCarSlideLeftGradient()
env = RLPyEnv(rc)
# env = ControllerEnv(k=10)
policy = CategoricalMLPPolicy(
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=3000,
    max_path_length=env.horizon,
    n_itr=100,
    discount=0.995,
    step_size=0.01,