Example #1
    model = DQN(dim, K, sizes, gamma)
    tmodel = DQN(dim, K, sizes, gamma)

    #Session
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    model.set_session(session)
    tmodel.set_session(session)

    #Directory to save results
    if not os.path.exists('Results'):
        os.mkdir('Results')
    os.chdir('Results')
    env = wrappers.Monitor(env,
                           'Policy_Gradient_Hill_Climbing_Result',
                           force=True)

    N = 500
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
Example #2
    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger, create TF session
        self.params = params
        self.logger = Logger(self.params['logdir'])
        self.sess = create_tf_session(self.params['use_gpu'],
                                      which_gpu=self.params['which_gpu'])

        # Set random seeds
        seed = self.params['seed']
        tf.set_random_seed(seed)
        np.random.seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        if 'env_wrappers' in self.params:
            # These operations are currently only for Atari envs
            self.env = wrappers.Monitor(self.env,
                                        os.path.join(self.params['logdir'],
                                                     "gym"),
                                        force=True)
            self.env = params['env_wrappers'](self.env)
            self.mean_episode_reward = -float('nan')
            self.best_mean_episode_reward = -float('inf')
        self.env.seed(seed)

        # import plotting (locally if 'obstacles' env)
        if not (self.params['env_name'] == 'obstacles-cs285-v0'):
            import matplotlib
            matplotlib.use('Agg')

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
        global MAX_VIDEO_LEN
        MAX_VIDEO_LEN = self.params['ep_len']

        # Is this env continuous, or discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        # Are the observations images?
        img = len(self.env.observation_space.shape) > 2

        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes

        ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        elif 'env_wrappers' in self.params:
            self.fps = 30  # This is not actually used when using the Monitor wrapper
        elif 'video.frames_per_second' in self.env.env.metadata.keys():
            self.fps = self.env.env.metadata['video.frames_per_second']
        else:
            self.fps = 10

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.sess, self.env,
                                 self.params['agent_params'])

        #############
        ## INIT VARS
        #############

        tf.global_variables_initializer().run(session=self.sess)
Example #3
import gym
from gym import wrappers
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, './video')

for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        # env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break
import gym
import numpy as np
from gym import wrappers

import os

env = gym.make('CartPole-v0')

model_dir = './models/model_[50, 50, 50, 50]/1567524154.1553748'

save_dir = model_dir + '/videos'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

env = wrappers.Monitor(env, save_dir, force=True)

BUCKET_SIZE = [50, 50, 50, 50]
LIMIT_VAR = [(env.observation_space.low[0], env.observation_space.high[0]),
             (-4, 4),
             (env.observation_space.low[2], env.observation_space.high[2]),
             (-5, 5)]
WINDOW_SIZE = []
real_size = [(size + 2) if size != 1 else 1 for size in BUCKET_SIZE]

Q_values = np.load(
    './models/model_[50, 50, 50, 50]/1567524154.1553748/model_at_100000_episodes_trained.npy'
)


def chooseAction(discrete_state):
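    # The function body is cut off in this excerpt; a plausible one-line completion
    # (an assumption, not the original author's code) is a greedy Q-table lookup:
    return np.argmax(Q_values[discrete_state])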
Example #5
# imports required by this excerpt
import gym
import numpy
import rospy
import rospkg
from gym import wrappers

# import our training environment
import old_way_moving_cube_env

if __name__ == '__main__':

    rospy.init_node('movingcube_gym', anonymous=True, log_level=rospy.WARN)

    # Create the Gym environment
    env = gym.make('OldMovingCube-v0')
    rospy.loginfo("Gym environment done")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('moving_cube_training_pkg')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = numpy.ndarray(0)

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
    # They are loaded at runtime by the launch file
    Alpha = rospy.get_param("/moving_cube/alpha")
    Epsilon = rospy.get_param("/moving_cube/epsilon")
    Gamma = rospy.get_param("/moving_cube/gamma")
    epsilon_discount = rospy.get_param("/moving_cube/epsilon_discount")
    nepisodes = rospy.get_param("/moving_cube/nepisodes")
    nsteps = rospy.get_param("/moving_cube/nsteps")

    running_step = rospy.get_param("/moving_cube/running_step")
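
    # For reference, the yaml file mentioned above might look like the following
    # (hypothetical values; the actual config ships with moving_cube_training_pkg):
    #
    #   moving_cube:
    #     alpha: 0.1
    #     epsilon: 0.9
    #     gamma: 0.8
    #     epsilon_discount: 0.999
    #     nepisodes: 500
    #     nsteps: 1000
    #     running_step: 0.06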
Example #6
        scores = {k: max(r_pos, r_neg) for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
        # keep the best-scoring directions (highest rewards first); we don't need to specify 0 (the lower bound of the slice)
        order = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)[:hp.nb_best_directions]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

        #Updating our policy
        policy.update(rollouts, sigma_r)

        #Printing the final reward of the policy after the update
        reward_evaluation = explore(env, normalizer, policy)
        print('Step: ', step, 'Reward: ', reward_evaluation)

def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')

hp = Hp()
np.random.seed(hp.seed)
env = gym.make(hp.env_name)
env = wrappers.Monitor(env, monitor_dir, force = True)
nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]
policy = Policy(nb_inputs, nb_outputs)
normalizer = Normalizer(nb_inputs)
train(env, policy, normalizer, hp)
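
# For reference, the update applied by policy.update(rollouts, sigma_r) above is the
# Augmented Random Search rule; a standalone sketch under assumptions (theta is the
# policy weight matrix; learning_rate and nb_best_directions come from Hp):
def ars_update(theta, rollouts, sigma_r, learning_rate, nb_best_directions):
    # sum the perturbation directions weighted by their reward differences
    step = np.zeros(theta.shape)
    for r_pos, r_neg, delta in rollouts:
        step += (r_pos - r_neg) * delta
    # scale by the learning rate and the standard deviation of the collected rewards
    return theta + learning_rate / (nb_best_directions * sigma_r) * step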


Example #7
import gym
from time import sleep
from gym import wrappers
env = gym.make('LunarLander-v2')
env = wrappers.Monitor(env, './')
env.seed(0)

g = 1.0
delta_t = 1.0 / 50.0
action = 0

state = env.reset()

y0 = state[1]
v0 = 0
cut_off = 0.01

for t in range(3000):
    env.render()
    state, reward, done, _ = env.step(action)
    y = state[1]
    v = (y - y0) / delta_t
    if done or y < 0 or abs(v) < 0.001:  # stop once landed or the descent has essentially stopped
        break

    alt_burn = (y * g + 0.5 * v * v) / (13.0 / env.lander.mass * 0.5)

    v0 = v
    y0 = y
    if y < alt_burn and y > cut_off:
        action = 2
Example #8
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            video_dir=None):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: agent.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(agent))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    # If monitoring has been requested, manually wrap our environment with a
    # gym monitor, which is set to record every episode.
    if video_dir:
        env = gym_wrappers.Monitor(env=env,
                                   directory=video_dir,
                                   video_callable=lambda x: True,
                                   force=True)

    steps = 0
    episodes = 0
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {}".format(episodes, reward_total))
        if done:
            episodes += 1
Example #9
import gym
import numpy as np
import tensorflow as tf
from gym import wrappers

#Deep Q-learning algorithm
# 1. Do a feedforward pass for the current state s to get predicted Q-values for all actions.
# 2. Do a feedforward pass for the next state s' and calculate the maximum over all network outputs, max_a' Q(s', a').
# 3. Set the Q-value target for action a to r + gamma * max_a' Q(s', a') (using the max calculated in step 2). For all other actions, set the Q-value target to the value originally returned in step 1, making the error 0 for those outputs.
# 4. Update the weights using backpropagation.

# creates the frozenlake environment
env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(env, '/tmp/frozenlake-qlearning', force=True)
n_obv = env.observation_space.n
n_acts = env.action_space.n

#neural network
x = tf.placeholder(shape=[1, 16], dtype=tf.float32)
y_ = tf.placeholder(shape=[1, 4], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.1))
y = tf.matmul(x, W)
action = tf.argmax(y, 1)

cost = tf.reduce_sum(tf.square(y_ - y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

#tensorflow initialization
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
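
# A minimal training-loop sketch (an addition, not part of the original snippet) that
# implements steps 1-4 described above with the network defined here; the hyperparameters
# gamma, epsilon and num_episodes are assumptions.
gamma = 0.99
epsilon = 0.1
num_episodes = 2000

for episode in range(num_episodes):
    s = env.reset()
    done = False
    while not done:
        # Step 1: feedforward pass for the current state (one-hot encoded) to get Q-values
        a, q_values = sess.run([action, y], feed_dict={x: np.identity(n_obv)[s:s + 1]})
        if np.random.rand() < epsilon:
            a[0] = env.action_space.sample()
        s_next, reward, done, _ = env.step(a[0])
        # Step 2: feedforward pass for the next state and take the maximum output
        q_next = sess.run(y, feed_dict={x: np.identity(n_obv)[s_next:s_next + 1]})
        # Step 3: Bellman target for the chosen action; the other targets stay unchanged
        q_values[0, a[0]] = reward + gamma * np.max(q_next)
        # Step 4: update the weights by backpropagation
        sess.run(optimizer, feed_dict={x: np.identity(n_obv)[s:s + 1], y_: q_values})
        s = s_next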
Example #10
              input_dims=(env.observation_space.shape),
              n_actions=(env.action_space.n),
              mem_size=50000,
              eps_min=0.1,
              batch_size=32,
              replace=1000,
              eps_dec=1e-5,
              checkpoint_dir='models/',
              algo='DQNAgent',
              env_name='PongNoFrameskip-v4')

agent.load_models()
print(agent.q_eval)

env = wrappers.Monitor(env,
                       "tmp/dqn-video",
                       video_callable=lambda episode_id: True,
                       force=True)

n_steps = 0
score = 0
done = False
obs = env.reset()

while not done:
    action = agent.choose_action(obs)
    resulted_obs, reward, done, info = env.step(action)
    score += reward
    obs = resulted_obs
    n_steps += 1
    print(n_steps)
Example #11
def wrap_monitor(env, log_dir):
    env = wrappers.Monitor(env, log_dir, video_callable=lambda x: True)
    return env
Example #12
                   gamma=args.gamma,
                   epsilon=epsilon,
                   epsilon_min=epsilon_min,
                   epsilon_dec=epsilon_dec,
                   memory_size=args.memory_size,
                   batch_size=args.batch_size,
                   replace=args.replace,
                   checkpoint_dir=models_path,
                   algo=agent_path,
                   env_name=args.env_name)

    if load_checkpoint:
        agent.load_models()
        videos_path = os.path.join(videos_path, )
        env = wrappers.Monitor(env,
                               videos_path,
                               video_callable=lambda episode_id: True,
                               force=True)  # force overwrites previous video

    # for saving plot
    # fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' + str(n_games) + 'games'
    if load_checkpoint:
        figure_file = os.path.join(plots_path, 'plot_eval.png')
    else:
        figure_file = os.path.join(plots_path, 'plot.png')

    n_steps = 0
    # The steps array is for plotting scores with respect to steps instead of games played,
    # because games vary a lot in length (they can be short or long), while steps are uniform.
    scores, eps_history, steps_array = [], [], []

    start = time.time()
Example #13
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--network',
                        choices=['deep', 'linear'],
                        default='deep')
    parser.add_argument('--method',
                        choices=['dqn', 'double', 'dueling'],
                        default='dqn')
    parser.add_argument('--monitor', type=bool, default=False)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy',
                        choices=['Greedy', 'GreedyEpsilon'],
                        default='GreedyEpsilon')

    args = parser.parse_args()
    args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(
        args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(
        args.env, args.method, args.network, args.iter, args.test_policy)
    if args.mode == 'train':
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path, force=True)
    np.random.seed(args.seed)
    env.seed(args.seed)

    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32

    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4

    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000

    args.output = get_output_folder(args.output, args.env)

    args.suffix = args.method + '_' + args.network
    if (args.method == 'dqn'):
        args.enable_double_dqn = False
        args.enable_dueling_network = False
    elif (args.method == 'double'):
        args.enable_double_dqn = True
        args.enable_dueling_network = False
    elif (args.method == 'dueling'):
        args.enable_double_dqn = False
        args.enable_dueling_network = True
    else:
        print('Attention! Wrong method!')

    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    elif args.test_policy == 'GreedyEpsilon':
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)

    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape,
                         args.num_actions, args.network)

    # We create our preprocessors: the AtariPreprocessor only processes the current frame the agent
    # is seeing, while the sequence preprocessor constructs the state by concatenating the 3 previous
    # frames from the HistoryPreprocessor with the current processed frame.
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)  # construct 84x84x4

    # we create our memory for saving all experience collected during training with window length 4
    memory = ReplayMemory(max_size=args.memory_max_size,
                          input_shape=args.input_shape,
                          window_length=args.window_length)

    # We use a linear-decay greedy-epsilon policy, annealing epsilon from 1 to 0.1 over the first
    # 1,000,000 iterations, and then keep training the network with epsilon fixed at 0.1.

    # We construct our agent with a discount factor of 0.99 and a batch size of 32. The model is
    # updated every 4 iterations, but during the first 50,000 iterations we only collect experience
    # into the replay memory and do not update the model.
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon),
                                            attr_name='eps',
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)

    dqn = DQNAgent(q_network=model,
                   policy=policy,
                   memory=memory,
                   num_actions=args.num_actions,
                   test_policy=test_policy,
                   preprocessor=ProcessorSequence,
                   gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in,
                   train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)

    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        checkpoint_weights_filename = 'dqn_' + args.env + '_weights_' + args.suffix + '_{step}.h5f'
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=args.model_checkpoint_interval)
        ]
        callbacks += [FileLogger(log_filename, interval=100)]
        callbacks += [
            TensorboardStepVisualization(log_dir=log_dir,
                                         histogram_freq=1,
                                         write_graph=True,
                                         write_images=True)
        ]

        # start training
        # we don't apply action repetition explicitly since the environment already skips frames randomly itself
        dqn.fit(env,
                callbacks=callbacks,
                verbose=1,
                num_iterations=args.num_iterations,
                action_repetition=1,
                log_interval=args.log_interval,
                visualize=True)

        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env,
                     num_episodes=10,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.evaluate(env,
                     num_episodes=250,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)

        # we upload our result to openai gym
        if args.monitor:
            env.close()
Example #14
import numpy as np
import gym
from gym import wrappers

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

from rl.processors import WhiteningNormalizerProcessor
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess


class MujocoProcessor(WhiteningNormalizerProcessor):
    def process_action(self, action):
        return np.clip(action, -1., 1.)


ENV_NAME = 'HalfCheetah-v2'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
env = wrappers.Monitor(env, '/tmp/{}'.format(ENV_NAME), force=True)
np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(400))
actor.add(Activation('relu'))
actor.add(Dense(300))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('tanh'))
print(actor.summary())
from sac2019 import SACAgent as SAC
import numpy as np
import os
import torch
import gym
import pybullet_envs
from gym import wrappers

monitor_path = './monitor/'
if not os.path.exists(monitor_path):
    os.makedirs(monitor_path)

env_name = "AntBulletEnv-v0"
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
env = wrappers.Monitor(env, monitor_path, force=True)
start_timesteps = 10_000
eval_freq = 5_000
max_timesteps = 500_000
batch_size = 100

total_timesteps = 0
episode_reward = 0
episode_timesteps = 0
episode_num = 0
done = False
obs = env.reset()

gamma = 0.99
tau = 0.005
alpha = 0.2
Example #16
    return returns


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("env_name", type=str)

    args = parser.parse_args()
    env_name = args.env_name

    env = gym.make(env_name)
    env = wrappers.Monitor(
        env,
        f"Saved_Videos/hw1/dagger/{env_name}/",
        resume=True,
        force=True,
        video_callable=lambda episode: episode % 10 == 0,
    )

    model = load_model(f"./models/hw1/{env_name}.h5")

    file = open(f"./expert_data/{env_name}.pkl", "rb")
    data = pickle.load(file)
    exp_observations, exp_actions = data["observations"], data["actions"]
    policy_fn = load_policy.load_policy(f"./experts/{env_name}.pkl")

    returns = dagger(exp_observations, exp_actions, model, max_steps=1000)

    print(f"returns = {returns}")
    print(f"mean return = {np.mean(returns)}")
                                                   flush_millis=10000,
                                                   filename_suffix="-cartpole")
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    global_step = tf.Variable(0)
    net = Net(hidden_size, obs_size, n_actions)

    for iter_no, batch in enumerate(iterate_batches(env, net, batch_size)):
        obs_v, acts_v, reward_b, reward_m = filter_batch(batch, percentile)
        loss_v, grads = grad(net, obs_v, acts_v)
        optimizer.apply_gradients(zip(grads, net.trainable_variables),
                                  global_step)

        print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" %
              (iter_no, loss_v.numpy(), reward_m, reward_b))

        with writer.as_default(), tf.contrib.summary.always_record_summaries():
            tf.contrib.summary.scalar("loss", loss_v.numpy())
            tf.contrib.summary.scalar("reward_bound", reward_b)
            tf.contrib.summary.scalar("reward_mean", reward_m)

        if reward_m > 199:
            print("Solved!")
            break

    env.close()
    writer.close()

    env = gym.make('CartPole-v0')
    env = wrappers.Monitor(env, '/tmp/cartpole-cross-entropy', force=True)
    play(net, env)
    env.close()
Example #18
        return img


def make_env():
    env_spec = gym.spec('ppaquette/DoomCorridor-v0')
    env_spec.id = 'DoomBasic-v0'
    env = env_spec.make()
    e = PreprocessImage(SkipWrapper(4)(ToDiscrete("minimal")(env)),
                        width=80,
                        height=80,
                        grayscale=True)
    return e


env = make_env()
env = wrappers.Monitor(env, './experiment', force=True)

NOOP, SHOOT, RIGHT, LEFT, FORWARD, TURN_R, TURN_L = 0, 1, 2, 3, 4, 5, 6
VALID_ACTIONS = [0, 1, 2, 3, 4, 5, 6]


class Estimator():
    def __init__(self, scope="estimator"):
        self.scope = scope
        with tf.variable_scope(scope):
            self._build_model()

    def _build_model(self):
        self.X_pl = tf.placeholder(shape=[None, 80, 80, 4],
                                   dtype=tf.float32,
                                   name="X")
    avg_length = episode_lengths.mean()
    print("avg length:", avg_length)
    return avg_length


def random_search(env):
    episode_lengths = []
    best = 0
    params = None
    for t in range(100):
        new_params = np.random.random(4) * 2 - 1
        avg_length = play_multiple_episodes(env, 100, new_params)
        episode_lengths.append(avg_length)

        if avg_length > best:
            params = new_params
            best = avg_length
    return episode_lengths, params


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    episode_lengths, params = random_search(env)
    plt.plot(episode_lengths)
    plt.show()

    # play a final set of episodes
    env = wrappers.Monitor(env, 'my_awesome_dir')
    print("***Final run with final weights***:", play_one_episode(env, params))
    def evaluate(self,
                 env,
                 args,
                 num_episodes,
                 eval_count,
                 max_episode_length=None,
                 monitor=True):
        """Test your agent with a provided environment.
        
        You shouldn't update your network parameters here. Also if you
        have any layers that vary in behavior between train/test time
        (such as dropout or batch norm), you should set them to test.

        Basically run your policy on the environment and collect stats
        like cumulative reward, average episode length, etc.

        You can also call the render function here if you want to
        visually inspect your policy.
        """
        print("Evaluation starts.")
        plt.figure(1, figsize=(22.5, 10))

        is_training = False
        if self.load_network:
            # self.q_network.load_weights(self.load_network_path)
            # print("Load network from:", self.load_network_path)
            self.restore_model(self.load_network_path)
        if monitor:
            env = wrappers.Monitor(env,
                                   self.output_path_videos,
                                   video_callable=lambda x: True,
                                   resume=True)
        state = env.reset()

        idx_episode = 1
        episode_frames = 0
        episode_reward = np.zeros(num_episodes)
        t = 0

        while idx_episode <= num_episodes:
            t += 1
            action_state = self.history_processor.process_state_for_network(
                self.atari_processor.process_state_for_network(state))
            action = self.select_action(action_state,
                                        is_training,
                                        policy_type='GreedyEpsilonPolicy')

            action_state_ori = self.history_processor.process_state_for_network_ori(
                self.atari_processor.process_state_for_network_ori(state))
            # print "state.shape", state.shape
            # print "action_state_ori.shape", action_state_ori.shape
            dice = np.random.random()

            state, reward, done, info = env.step(action)

            if dice < 1e-1 and not (args.train):
                alpha_list = self.sess.run(self.q_network.alpha_list,\
                            feed_dict={self.q_network.imageIn: action_state[None, :, :, :], self.q_network.batch_size:1})
                # print alpha_list, len(alpha_list), alpha_list[0].shape #10 (1, 49)
                for alpha_idx in range(len(alpha_list)):
                    plt.subplot(2, len(alpha_list) // 2 + 1, alpha_idx + 1)
                    img = action_state_ori[:, :, :, alpha_idx]  #(210, 160, 3)
                    plt.imshow(img)
                    alp_curr = alpha_list[alpha_idx].reshape(7, 7)
                    alp_img = skimage.transform.pyramid_expand(alp_curr,
                                                               upscale=22,
                                                               sigma=20)
                    plt.imshow(scipy.misc.imresize(
                        alp_img, (img.shape[0], img.shape[1])),
                               alpha=0.7,
                               cmap='gray')
                    plt.axis('off')
                plt.subplot(2, action_state_ori.shape[3] // 2 + 1,
                            action_state_ori.shape[3] + 2)
                plt.imshow(state)
                plt.savefig(
                    '%sattention_ep%d-frame%d.png' %
                    (self.output_path_images, eval_count, episode_frames))
                print('---- Image saved at: %sattention_ep%d-frame%d.png' %
                      (self.output_path_images, eval_count, episode_frames))

            episode_frames += 1
            episode_reward[idx_episode - 1] += reward
            if episode_frames > max_episode_length:
                done = True
            if done:
                print(
                    "Eval: time %d, episode %d, length %d, reward %.0f. @eval_count %s"
                    % (t, idx_episode, episode_frames,
                       episode_reward[idx_episode - 1], eval_count))
                eval_count += 1
                save_scalar(eval_count, 'eval/eval_episode_raw_reward',
                            episode_reward[idx_episode - 1], self.writer)
                save_scalar(eval_count, 'eval/eval_episode_raw_length',
                            episode_frames, self.writer)
                sys.stdout.flush()
                state = env.reset()
                episode_frames = 0
                idx_episode += 1
                self.atari_processor.reset()
                self.history_processor.reset()

        reward_mean = np.mean(episode_reward)
        reward_std = np.std(episode_reward)
        print(
            "Evaluation summury: num_episodes [%d], reward_mean [%.3f], reward_std [%.3f]"
            % (num_episodes, reward_mean, reward_std))
        sys.stdout.flush()

        return reward_mean, reward_std, eval_count
Example #21
import gym
from gym import wrappers
import numpy as np

env = gym.make("FrozenLake-v0")
env = wrappers.Monitor(env, "./results", force=True)

Q = np.zeros([env.observation_space.n, env.action_space.n])
n_s_a = np.zeros([env.observation_space.n, env.action_space.n])

num_episodes = 100000
epsilon = 0.2
rList = []

for i in range(num_episodes):
    state = env.reset()
    rAll = 0
    done = False
    results_list = []
    result_sum = 0.0
    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])
        new_state, reward, done, _ = env.step(action)
        results_list.append((state, action))
        result_sum += reward
        state = new_state
        rAll += reward
    rList.append(rAll)
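    # The Q-table update is not shown in this excerpt; a plausible completion (an
    # assumption, not the original author's code) is an every-visit Monte Carlo update
    # that uses the visit counts in n_s_a as an adaptive step size:
    for (s, a) in results_list:
        n_s_a[s, a] += 1.0
        Q[s, a] += (result_sum - Q[s, a]) / n_s_a[s, a]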
Example #22
 def start_record(self, render=False):
     if not render:
         self.env.render(close=True)
     self.env = wrappers.Monitor(self.env, self.monitor_path, force=True)
Example #23
def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse,
                 prefix):
    def get_checkpoint_dir(checkpoint_list, limit, prefix):
        for checkpoint in checkpoint_list:
            if ('limitation_' + str(limit) in checkpoint) and (prefix
                                                               in checkpoint):
                return checkpoint
        return None

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=policy_hidden_size,
                                    num_hid_layers=2)

    data_path = os.path.join('data',
                             'deterministic.trpo.' + env_name + '.0.00.npz')
    dataset = load_dataset(data_path)
    checkpoint_list = glob.glob(
        os.path.join('checkpoint', '*' + env_name + ".*"))
    log = {
        'traj_limitation': [],
        'upper_bound': [],
        'avg_ret': [],
        'avg_len': [],
        'normalized_ret': []
    }
    for i, limit in enumerate(CONFIG['traj_limitation']):
        # Do one evaluation
        upper_bound = sum(dataset.rets[:limit]) / limit
        checkpoint_dir = get_checkpoint_dir(checkpoint_list,
                                            limit,
                                            prefix=prefix)
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
        env = gym.make(env_name +
                       '-v1')  # FIX: with MuJoCo 1.50, MuJoCo envs are -v2
        env = wrappers.Monitor(
            env, checkpoint_dir, force=True
        )  # ENHANCEMENT: Generate and save videos to checkpoint_dir
        # On MuJoCo 1.50 this may fail with "ERROR: GLEW initalization error: Missing GL version"; set LD_PRELOAD
        # https://github.com/openai/mujoco-py/issues/44
        env.seed(seed)
        print('Trajectory limitation: {}, Load checkpoint: {}, '.format(
            limit, checkpoint_path))
        avg_len, avg_ret = run_mujoco.runner(env,
                                             policy_fn,
                                             checkpoint_path,
                                             timesteps_per_batch=1024,
                                             number_trajs=10,
                                             stochastic_policy=stochastic,
                                             reuse=((i != 0) or reuse))
        normalized_ret = avg_ret / upper_bound
        print('Upper bound: {}, evaluation returns: {}, normalized scores: {}'.
              format(upper_bound, avg_ret, normalized_ret))
        log['traj_limitation'].append(limit)
        log['upper_bound'].append(upper_bound)
        log['avg_ret'].append(avg_ret)
        log['avg_len'].append(avg_len)
        log['normalized_ret'].append(normalized_ret)
        env.close()
    return log
Example #24
def sim_agent(env,
              policy,
              task,
              scaler,
              num_episodes_sim=1,
              animate=False,
              save_video=False,
              out_dir='./video'):
    """ Simulates trainned agent (Policy) in given environment (env)

    Args:
        env: ai gym environment
        policy: policy object with sample() method
        task: int indicating which head (task specific hidden layer) of the policy to use
        num_episodes_sim (int): number of episodes to simulate
        animate (bool): determines if video should be rendered in window
        save_video (bool): enables saving video and other stats of simulated episodes

    Returns:
        mean_reward_episodes (double): Mean reward obtained across all episodes
        if save_video=True, stores videos and stats in folder determined by 'out_dir'

    """

    # Monitoring Config
    if save_video:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)  # create directory if it doesn't exist
        env = wrappers.Monitor(env, out_dir,
                               force=True)  # Used to save log data and video

    # Simulate each Episode
    episodes_tot_reward = []
    for episode in range(num_episodes_sim):

        obs = env.reset()
        reward_sum = 0
        done = False
        step = 0.0
        scale, offset = scaler.get()  # standardize observations
        scale[-1] = 1.0  # don't scale the additional "step" time feature
        offset[-1] = 0.0  # don't offset the additional "step" time feature

        # Start Env. Simulation
        while not done:
            if animate: env.render()

            # Modify observation: additional feature + standardizing based on running mean / std
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)  # add time step as an additional feature
            obs = (obs - offset) * scale  # center and scale observations

            # Act based on Policy Network
            action = policy.sample(obs, task).reshape(
                (1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            reward_sum += reward
            step += 1e-3  # increment time step feature

        # Accumulate info for episode
        episodes_tot_reward.append(reward_sum)

    # Get Stats over all episodes
    mean_reward_episodes = np.mean(episodes_tot_reward)

    return mean_reward_episodes
Example #25
            G = np.dot(multiplier, quess_rewards)
            model.update(states[0], actions[0], G)
            rewards.pop(0)
            actions.pop(0)
            states.pop(0)

    return totalreward


if __name__ == "__main__":
    env = gym.make("MountainCar-v0")
    ft = FeatureTransformer(env)
    model = Model(env, ft)
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = 'Videos/' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)
    gamma = 0.99
    N = 300
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 1.0 / (0.1 * n + 1)
        totalreward = play_one_episode(model, eps, gamma)
        totalrewards[n] = totalreward
        print("episode:", n, "total reward:", totalreward, "eps: ", eps)
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", -totalrewards.sum())

    plt.plot(totalrewards)
    plt.show()
                .format(episode, n_train_episodes, episode_rewards, epsilon))
            break

    rewards.append(episode_rewards)
    epsilon = update_epsilon(epsilon)

# PLOT RESULTS
x = range(n_train_episodes)
plt.plot(x, rewards)
plt.xlabel('Episode number')
plt.ylabel('Training cumulative reward')
plt.savefig('DQN_CartPole.png', dpi=300)
plt.show()

# TEST PHASE
env = wrappers.Monitor(env, './videos/' + str(time()) + '/')
for episode in range(n_test_episodes):
    current_state = env.reset()
    current_state = preprocess_state(current_state)
    episode_rewards = 0

    for t in range(n_steps):
        env.render()
        action = greedy_policy(current_state)
        next_state, reward, done, _ = env.step(action)
        next_state = preprocess_state(next_state)
        memory.append((current_state, action, reward, next_state, done))
        current_state = next_state
        episode_rewards += reward

        if done:
Example #27
                        action="store",
                        default=500,
                        help="Nombre d\'épisodes d\'apprentissage.")
    parser.add_argument("-t",
                        "--test",
                        type=int,
                        action="store",
                        default=0,
                        help="Nombre d\'épisodes de test")
    args = parser.parse_args()
    logger.set_level(logger.INFO)
    env = gym.make("BreakoutNoFrameskip-v4")

    # Gym Monitor for video recording
    outdir = '/tmp/random-agent-results'
    env = wrappers.Monitor(env, directory=outdir, force=True)
    env.seed(0)

    # hyperparameters
    EXPLO = ["greedy", "boltzmann"]
    TARGET_UPDATE = ["freq", "polyak"]
    PARAMS = {
        "gamma": 0.8,
        "max_tau": 1,
        "min_tau": 0.1,
        "tau_decay": 0.99,
        "exploration": EXPLO[0],
        "sigma": 1e-3,
        "alpha": 0.005,
        "m": 4,
        "frame_skip": 4,
Example #28
    print(env.action_space)  # what actions are available
    print(env.step(1))  # perform action 1; returns the observation, the reward, and done, a boolean (game over or not)
    env.render()  # display the game grid
    env.render(mode="human")  # render to the console
    #statedic, mdp = env.getMDP()  # retrieve the MDP: statedic
    #print("Number of states: ", len(statedic))  # number of states; statedic: state -> state index
    #state, transitions = list(mdp.items())[0]
    #print(state)  # one state of the MDP
    #print(transitions)  # transition dictionary for this state: {action -> [proba, state, reward, done]}

    # Run with an Agent
    agent = RandomAgent(env.action_space)

    # Create a log file over several scenarios
    outdir = 'gridworld-v0/random-agent-results'
    envm = wrappers.Monitor(env, directory=outdir, force=True, video_callable=False)
    
    env.seed()  # seed the pseudo-random generator
    episode_count = 2000
    reward = 0
    done = False
    rsum = 0
    FPS = 0.0001
    all_rsum = []
    for i in range(episode_count):
        obs = envm.reset()
        env.verbose = (i % 100 == 0 and i > 0)  # display 1 episode out of every 100
        if env.verbose:
            env.render(FPS)
        j = 0
        #rsum = 0
Example #29
    done = False
    reward = 0.0
    while not done:
        action = scale_action(env, agent.step(obs, reward))
        obs, reward, done, _ = env.step(action)
        reward_tot += reward
        env.render()
    agent.step(obs, reward)
    agent.reset()
    return reward_tot


if __name__ == '__main__':
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    env_to_wrap = gym.make("Pendulum-v0")
    env = wrappers.Monitor(env_to_wrap,
                           'logging/',
                           force=True,
                           video_callable=lambda episode_id: True)

    agent = ModelAgent(env.observation_space.shape[0],
                       env.action_space.shape[0])

    for i in range(10):
        reward_tot = run_episode(env, agent)
        print("Episode: ", i + 1, "---", "Total Reward: ", reward_tot)
    env.close()
Example #30
 def eva(self):
     agent = DDQN_Agent(n_states=self.n_states,
                        n_actions=self.n_actions,
                        batch_size=self.config.batch_size,
                        hidden_size=self.config.hidden_size,
                        memory_size=self.config.memory_size,
                        update_step=self.config.update_step,
                        learning_rate=self.config.learning_rate,
                        gamma=self.config.gamma,
                        tau=self.config.tau)
     test_reward_array = np.zeros(100)
     # load check point to restore the model
     agent.policy_model.load_state_dict(
         torch.load(self.config.DDQN_CHECKPOINT_PATH,
                    map_location=agent.device))
     t = trange(self.config.test_episodes)
     for episode in t:
         state = self.env.reset()
         done = False
         rewards = 0
         while not done:
             # disable epsilon greedy search
             action = agent.act(state, epsilon=0)
             state, reward, done, _ = self.env.step(action)
             rewards += reward
         t.set_description('Episode {:.2f} Reward {:.2f}'.format(
             episode + 1, rewards))
         t.refresh()
         test_reward_array[episode] = rewards
     self.env.close()
     # show the evaluation results
     avg_test_reward = round(np.mean(test_reward_array), 2)
     plt.subplots(figsize=(5, 5), dpi=100)
     plt.plot(test_reward_array)
     plt.ylabel('Total Reward', fontsize=12)
     plt.xlabel('Trial', fontsize=12)
     plt.xticks(fontsize=12)
     plt.yticks(fontsize=12)
     plt.title(
         'Total Rewards Per Trial for 100 Trials - Average: {:.2f}'.format(
             avg_test_reward),
         fontsize=12)
     plt.savefig(self.config.DDQN_RESULT_IMG_PATH.format(1),
                 dpi=100,
                 bbox_inches='tight')
     print('\nSave evaluation rewards plot as {}.'.format(
         self.config.DDQN_RESULT_IMG_PATH.format(1)))
     # play a round
     env = wrappers.Monitor(self.env,
                            self.config.DDQN_AGENT_PATH,
                            force=True)
     state = env.reset()
     done = False
     rewards = 0.
     while not done:
         # disable epsilon greedy search
         action = agent.act(state, epsilon=0)
         state, reward, done, _ = env.step(action)
         rewards += reward
     env.close()
     print('Total rewards in a game: {:.2f}'.format(rewards))
     print('Save video record to {}.'.format(self.config.DDQN_AGENT_PATH))