Code example #1
def run_experiment(env_str, trials=10, max_steps=2000, render=False):

    successes = []

    for trial in range(trials):
        print("--------------------------")
        env = gym.make(env_str)
        env._max_episode_steps = max_steps

        env.seed(int(time.time()))  # seed environment
        prng.seed(int(time.time()))  # seed action space

        reward, steps, success = control_lqr_finite_differences(
            env, max_steps, render)
        print("Reward = {}".format(reward))
        successes.append(success)
        env.close()

        # Make sure rendering window is shut.
        time.sleep(1.0)

    pct_success = successes.count(True) / float(len(successes))
    pct_failure = successes.count(False) / float(len(successes))

    print("********************")
    print(successes)
    print("% success: " + str(pct_success))
    print("% failure: " + str(pct_failure))
Code example #2
File: mpc_engine.py  Project: abbyvansoest/thesis
def run_cartpole_experiment(M=20,
                            k=100,
                            max_steps=2000,
                            trials=100,
                            render=False):

    exp_steps = []
    step_threshold = 100

    for trial in range(trials):
        env = gym.make("CartPole-v1")
        env._max_episode_steps = max_steps

        env.seed(int(time.time()))  # seed environment
        prng.seed(int(time.time()))  # seed action space

        reward, steps, success = mpc.control_random_mpc_cartpole(
            env, M, k, render)
        exp_steps.append(steps)
        env.close()

        # Make sure rendering window is shut.
        time.sleep(1.0)

    successes = sum(s >= step_threshold for s in exp_steps)
    failures = len(exp_steps) - successes

    pct_success = successes / float(trials)
    pct_failure = failures / float(trials)

    print("********************")
    print(exp_steps)
    print("% success: " + str(pct_success))
    print("% failure: " + str(pct_failure))
Code example #3
File: mpc_engine.py  Project: abbyvansoest/thesis
def run_pendulum_experiment(M=10,
                            k=2000,
                            max_steps=2000,
                            trials=100,
                            render=False):

    rewards = []

    for trial in range(trials):
        env = gym.make("Pendulum-v0")
        env._max_episode_steps = max_steps

        env.seed(int(time.time()))  # seed environment
        prng.seed(int(time.time()))  # seed action space

        reward, steps, success = mpc.control_random_mpc_pendulum(
            env, M, k, render)
        rewards.append(reward)
        env.close()

        # Make sure rendering window is shut.
        time.sleep(1.0)

    print("********************")
    print(rewards)
Code example #4
File: base_agent.py  Project: ericl/yarlp
    def __init__(self,
                 env,
                 discount_factor=0.99,
                 log_dir=None,
                 seed=None,
                 gae_lambda=0,
                 reward_len=100):
        """
        discount_factor : float
            Discount rewards by this factor
        """
        # Discount factor
        assert 0 <= discount_factor <= 1
        self._discount = discount_factor
        self.gae_lambda = gae_lambda

        self.log_dir = log_dir
        self.reward_len = reward_len
        self.set_logger(log_dir, reward_len)

        if seed is not None:
            self.logger.logger.info('Seed: {}'.format(seed))
            tf_utils.set_global_seeds(seed)
            env.seed(seed)
            prng.seed(seed)

        # any tensorflow models should be cached and serialized separately
        self.tf_object_attributes = set()
        self.unserializables = set(['logger', 'replay_buffer'])

        self._env = env
        self._env_id = '{}_gym{}'.format(env.spec.id, gym.__version__)
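The exact behaviour of tf_utils.set_global_seeds above is specific to the yarlp project. As a rough illustration only, a typical "global seed" helper (in the style of OpenAI Baselines) seeds every library-level RNG the agent touches; the version below is a sketch under that assumption, not the project's actual implementation.

import random

import numpy as np


def set_global_seeds(seed):
    # Seed the framework-agnostic RNGs; TensorFlow seeding is version-specific.
    try:
        import tensorflow as tf
        tf.set_random_seed(seed)  # TF1-style API; TF2 uses tf.random.set_seed(seed)
    except ImportError:
        pass
    np.random.seed(seed)
    random.seed(seed)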
Code example #5
File: mpc_engine.py  Project: abbyvansoest/thesis
def run_mountaincar_discrete_experiment(M=20,
                                        k=2000,
                                        max_steps=2000,
                                        trials=100,
                                        render=False):

    successes = []

    for trial in range(trials):
        env = gym.make("MountainCar-v0")
        env._max_episode_steps = max_steps

        env.seed(int(time.time()))  # seed environment
        prng.seed(int(time.time()))  # seed action space

        reward, steps, success = mpc.control_random_mpc_mountaincar_discrete(
            env, M, k, render)
        successes.append(success)
        env.close()

        # Make sure rendering window is shut.
        time.sleep(1.0)

    print(successes)

    pct_success = successes.count(True) / float(len(successes))
    pct_failure = successes.count(False) / float(len(successes))

    print("********************")
    print("% success: " + str(pct_success))
    print("% failure: " + str(pct_failure))
Code example #6
def basic_segments_from_rand_rollout(
        env_id,
        make_env,
        n_desired_segments,
        clip_length_in_seconds,
        # These are only for use with multiprocessing
        seed=0,
        _verbose=True,
        _multiplier=1):
    """ Generate a list of path segments by doing random rollouts. No multiprocessing. """
    segments = []
    env = make_env(env_id)
    env.seed(seed)
    space_prng.seed(seed)
    segment_length = int(clip_length_in_seconds * env.fps)
    while len(segments) < n_desired_segments:
        path = do_rollout(env, random_action)
        # Calculate the number of segments to sample from the path
        # Such that the probability of sampling the same part twice is fairly low.
        segments_for_this_path = max(
            1, int(0.25 * len(path["obs"]) / segment_length))
        for _ in range(segments_for_this_path):
            segment = sample_segment_from_path(path, segment_length)
            if segment:
                segments.append(segment)

            if _verbose and len(segments) % 10 == 0 and len(segments) > 0:
                print("Collected %s/%s segments" %
                      (len(segments) * _multiplier,
                       n_desired_segments * _multiplier))

    if _verbose:
        print("Successfully collected %s segments" %
              (len(segments) * _multiplier))
    return segments
Code example #7
    def __init__(self, env, gamma, lr, obs_dim, action_dim):
        super(AntEntropyPolicy, self).__init__()

        self.affine1 = nn.Linear(obs_dim, 128)
        self.middle = nn.Linear(128, 128)
        self.mu = nn.Linear(128, action_dim)
        self.sigma = nn.Linear(128, action_dim)

        torch.nn.init.xavier_uniform_(self.affine1.weight)
        torch.nn.init.xavier_uniform_(self.middle.weight)
        torch.nn.init.xavier_uniform_(self.mu.weight)
        torch.nn.init.xavier_uniform_(self.sigma.weight)

        self.saved_log_probs = []
        self.rewards = []

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.eps = np.finfo(np.float32).eps.item()

        self.env = env
        self.gamma = gamma
        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.env.env.set_state(ant_utils.qpos, ant_utils.qvel)
        self.init_state = np.array(self.env.env.state_vector())
        self.env.seed(int(time.time()))  # seed environment
        prng.seed(int(time.time()))  # seed action space
Code example #8
def load_frozen_lake():
    env = FL.FrozenLakeEnv()
    env.seed(0)
    prng.seed(10)
    np.random.seed(0)
    np.set_printoptions(precision=3)
    print(env.__doc__)
    env.demonstrate()
    return env
Code example #9
File: collect.py  Project: abbyvansoest/thesis
def main():

    save = False

    # Suppress scientific notation.
    np.set_printoptions(suppress=True, edgeitems=100)

    # Make environment.
    env = gym.make(args.env)
    env.seed(int(time.time())) # seed environment
    prng.seed(int(time.time())) # seed action space

    # Set up logging to file 
    TIME = datetime.now().strftime('%Y_%m_%d-%H-%M')
    LOG_DIR = 'logs-' + args.env + '/'
    if not os.path.exists(LOG_DIR):
        os.mkdir(LOG_DIR)

    FILE_NAME = 'test' + TIME
    logging.basicConfig(level=logging.DEBUG,
                        format='%(message)s',
                        datefmt='%m-%d %H:%M',
                        filename=LOG_DIR + FILE_NAME + '.log',
                        filemode='w')
    logger = logging.getLogger(args.env + '-curiosity.pt')

    MODEL_DIR = 'models-' + args.env + '/models_' + TIME + '/'
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    # save metadata from the run. 
    with open(MODEL_DIR + "metadata", "w") as metadata:
        metadata.write("args: %s\n" % args)
        metadata.write("num_states: %s\n" % str(utils.num_states))
        metadata.write("state_bins: %s\n" % utils.state_bins)

    policies, running_avg_entropies, entropies, running_avg_ps, average_ps = collect_entropy_policies(env, args.epochs, args.T, MODEL_DIR, logger)
    plotting.generate_figures(args.env, MODEL_DIR, running_avg_entropies, entropies, running_avg_ps, average_ps)

    exploration_policy = average_policies(env, policies)
    if (args.collect_video):
        MODEL_DIR = ''
    # average_p = exploration_policy.execute(args.T, render=True, save_video_dir=MODEL_DIR+'videos/epoch_' + str(args.epochs) + '/')
    average_p = exploration_policy.execute(args.T)  # estimate the average state distribution
    overall_avg_ent = scipy.stats.entropy(average_p.flatten())

    # average_p = curiosity.execute_average_policy(env, policies, args.T, render=True)

    log_iteration('average', logger, average_p, [])
    print('*************')
    print(np.reshape(average_p, utils.space_dim))

    print("overall_avg_ent = %f" % overall_avg_ent)

    env.close()

    print("DONE")
Code example #10
File: pendulum_ilqr.py  Project: abbyvansoest/thesis
    def __init__(self):
        self.env = gym.make("Pendulum-v0")
        self.env.seed(int(time.time()))  # seed environment
        prng.seed(int(time.time()))  # seed action space

        self.initial_state = self.env.reset()

        self.state_size = len(self.env.observation_space.sample())
        self.action_size = 1

        self.x_goal = np.array([np.sin(0), np.cos(0), 0])
Code example #11
    def __init__(self):
        self.env = gym.make("MountainCarContinuous-v0")
        self.env.seed(int(time.time())) # seed environment
        prng.seed(int(time.time())) # seed action space

        self.initial_state = self.env.reset()
        print("initial_state: " + str(self.initial_state))

        self.state_size = len(self.env.observation_space.sample())
        self.action_size = 1

        self.x_goal = np.array([0.50, 1])
        self.cost = 0
Code example #12
def main():

    # Suppress scientific notation.
    np.set_printoptions(suppress=True, edgeitems=100)

    # Make environment.
    env = gym.make("HalfCheetah-v2")
    env.seed(int(time.time()))  # seed environment
    prng.seed(int(time.time()))  # seed action space

    # Set up experiment variables.
    T = 10000
    avg_runs = 10

    policies = load_from_dir(args.models_dir)

    for t in range(1, len(policies)):
        avg_state_dict = collect.average_policies(policies[:t])
        exploration_policy = CheetahEntropyPolicy(env, args.gamma)
        exploration_policy.load_state_dict(avg_state_dict)

        # Average the state distribution over (avg_runs + 1) executions.
        average_p = exploration_policy.execute(T)
        for i in range(avg_runs):
            average_p += exploration_policy.execute(T)
        average_p /= float(avg_runs + 1)
        ent_average_p = scipy.stats.entropy(average_p.flatten())

        print('---------------------')
        print("Average policies[:%d]" % t)
        # print(average_p)
        print(ent_average_p)

    # obtain average policy.
    average_policy_state_dict = collect.average_policies(policies)
    exploration_policy = CheetahEntropyPolicy(env, args.gamma)
    exploration_policy.load_state_dict(average_policy_state_dict)
    average_p = exploration_policy.execute(T)

    print('*************')
    print(np.reshape(average_p, utils.space_dim))

    # Now, learn the actual reward structure based on environment rewards.
    # actual_policy = ExplorePolicy(env, utils.obs_dim, utils.action_dim, exploration_policy, args.lr, args.gamma, args.eps)
    # actual_policy.learn_policy(args.episodes, args.train_steps)
    # actual_policy.execute(T, render=True)
    # actual_policy.save()

    env.close()
Code example #13
    def __init__(self, env, gamma, lr, obs_dim, action_dim):
        super(AntActorCritic, self).__init__()

        self.linear1 = nn.Linear(obs_dim, 200)
        self.lstm = nn.LSTMCell(200, 128)
        # Actor
        self.mu_linear = nn.Linear(128, action_dim)
        self.sigma_sq_linear = nn.Linear(128, action_dim)
        # Critic
        self.value_linear = nn.Linear(128, 1)

        # initialize weight
        self.apply(weights_init)
        self.mu_linear.weight.data = normalized_columns_initializer(
            self.mu_linear.weight.data, 0.01)
        self.sigma_sq_linear.weight.data = normalized_columns_initializer(
            self.sigma_sq_linear.weight.data, 0.01)
        self.mu_linear.bias.data.fill_(0)
        self.sigma_sq_linear.bias.data.fill_(0)

        self.value_linear.weight.data = normalized_columns_initializer(
            self.value_linear.weight.data, 1.0)
        self.value_linear.bias.data.fill_(0)

        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)

        self.train()

        self.saved_log_probs = []
        self.rewards = []
        self.values = []
        self.entropies = []

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.eps = np.finfo(np.float32).eps.item()

        self.env = env
        self.gamma = gamma
        self.tau = 1.00
        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.env.env.set_state(ant_utils.qpos, ant_utils.qvel)
        self.init_state = np.array(self.env.env.state_vector())
        self.env.seed(int(time.time()))  # seed environment
        prng.seed(int(time.time()))  # seed action space
Code example #14
def main():
    # Suppress scientific notation.
    np.set_printoptions(suppress=True)

    # Make environment.
    env = gym.make(args.env)
    env.seed(int(time.time()))  # seed environment
    prng.seed(int(time.time()))  # seed action space

    # Set up experiment variables.
    T = 1000
    avg_runs = 10

    policies = load_from_dir(args.models_dir)

    times = []
    entropies = []

    x_dist_times = []
    x_distributions = []

    v_dist_times = []
    v_distributions = []

    for t in range(1, len(policies)):

        average_p, avg_entropy = average_p_and_entropy(policies[:t], avg_runs)

        print('---------------------')
        print("Average policies[:%d]" % t)
        print(average_p)
        print(avg_entropy)

    # obtain global average policy.
    exploration_policy = collect.average_policies(env, policies)
    average_p = exploration_policy.execute(T)

    print('*************')
    print(average_p)

    # actual_policy = ExplorePolicy(env, obs_dim, action_dim, exploration_policy, args.lr, args.gamma)
    # actual_policy.learn_policy(args.episodes, args.train_steps)
    # actual_policy.execute(T, render=True)
    # actual_policy.save()

    env.close()
Code example #15
def main():

    # Suppress scientific notation.
    np.set_printoptions(suppress=True, edgeitems=100)

    # Make environment.
    env = gym.make(args.env)
    # TODO: limit acceleration (maybe also speed?) for Pendulum.
    if args.env == "Pendulum-v0":
        env.env.max_speed = 8
        env.env.max_torque = 1
    env.seed(int(time.time()))  # seed environment
    prng.seed(int(time.time()))  # seed action space

    TIME = datetime.now().strftime('%Y_%m_%d-%H-%M')
    MODEL_DIR = 'models-' + args.env + '/models_' + TIME + '/'

    if args.save_models:
        if not os.path.exists(MODEL_DIR):
            os.makedirs(MODEL_DIR)

        # save metadata from the run.
        with open(MODEL_DIR + "metadata", "w") as metadata:
            metadata.write("args: %s\n" % args)
            metadata.write("num_states: %s\n" % str(utils.num_states))
            metadata.write("state_bins: %s\n" % utils.state_bins)

    plotting.FIG_DIR = 'figs/' + args.env + '/'
    plotting.model_time = 'models_' + TIME + '/'
    if not os.path.exists(plotting.FIG_DIR + plotting.model_time):
        os.makedirs(plotting.FIG_DIR + plotting.model_time)

    policies = collect_entropy_policies(env, args.epochs, args.T, MODEL_DIR)

    exploration_policy = average_policies(env, policies)

    # Final policy:
    # average_p, _, _ = curiosity.execute_average_policy(env, policies, args.T)
    # overall_avg_ent = scipy.stats.entropy(average_p.flatten())
    # print('*************')
    # print(np.reshape(average_p, utils.space_dim))
    # print("overall_avg_ent = %f" % overall_avg_ent)

    env.close()

    print("DONE")
Code example #16
def run_environment_episode(env, pi, seed, model_file, max_timesteps, render,
                            stochastic):
    number_of_timestep = 0
    done = False

    # load model
    my_tf_util.load_state(model_file)

    # set seed
    set_global_seeds(seed)
    env.seed(seed)

    import gym.spaces.prng as prng
    prng.seed(seed)

    obs = env.reset()

    cum_reward = []
    observations = []
    distance = []
    cum_rew_p = []

    # max_timesteps is set to 1000
    while (not done) and number_of_timestep < max_timesteps:

        action, _ = pi.act(stochastic, obs)

        obs, reward, done, info = env.step(action)

        observations.append(obs)

        cum_reward.append(reward)

        distance.append(info["distance_delta"])

        cum_rew_p.append(info["rew_p"])

        # render
        if render:
            env.render()

        number_of_timestep += 1

    return observations, cum_reward, distance, cum_rew_p
Code example #17
def example(env):
    """Show an example of gym

    Parameters
    ----------
    env: gym.core.Environment
        Environment to play on. Must have nS, nA, and P as
        attributes.
    """
    env.seed(0)
    from gym.spaces import prng
    prng.seed(10)  # seed the action space so the sampled actions are reproducible
    # Generate the episode
    ob = env.reset()
    for t in range(100):
        env.render()
        a = env.action_space.sample()
        ob, rew, done, _ = env.step(a)
        if done:
            break
    assert done
    env.render()
Code example #18
def main():
    learning_rate = 0.1
    epochs = 20

    gamma = 1
    horizon = 200
    traj_len = 15

    env = FrozenLakeEnvMultigoal(goal=2)
    env.seed(0)
    prng.seed(10)
    mdp1 = MDP(FrozenLakeEnvMultigoal(is_slippery=False, goal=1))
    r1 = np.zeros(mdp1.nS)
    r1[-1] = 1
    print('Reward used to generate expert trajectories: ', r1)

    policy1 = compute_policy(mdp1, gamma, r1, threshold=1e-8, horizon=horizon)
    trajectories1 = generate_trajectories(mdp1, policy1, traj_len, 200)
    print('Generated ', trajectories1.shape[0], ' traj of length ', traj_len)

    sa_visit_count, _ = compute_s_a_visitations(mdp1, gamma, trajectories1)
    print(
        'Log likelihood of all traj under the policy generated',
        'from the original reward: ', np.sum(sa_visit_count * np.log(policy1)),
        'average per traj step: ',
        np.sum(sa_visit_count * np.log(policy1)) /
        (trajectories1.shape[0] * trajectories1.shape[1]), '\n')

    r = np.random.rand(mdp1.nS)
    print('Randomly initialized reward: ', r)

    r = max_causal_ent_irl(mdp1,
                           gamma,
                           trajectories1,
                           epochs,
                           learning_rate,
                           r=r,
                           horizon=horizon)

    print('Final reward: ', r)
Code example #19
def main():

    # Suppress scientific notation.
    np.set_printoptions(suppress=True, edgeitems=100)

    # Make environment.
    env = gym.make(args.env)
    env.seed(int(time.time()))  # seed environment
    prng.seed(int(time.time()))  # seed action space

    # Set up saving models.
    # TIME = datetime.now().strftime('%Y_%m_%d-%H-%M')
    # MODEL_DIR = 'models-' + args.env + '/models_' + TIME + '/'
    # if not os.path.exists(MODEL_DIR):
    #     os.makedirs(MODEL_DIR)

    # # save metadata from the run.
    # with open(MODEL_DIR + "metadata", "w") as metadata:
    #     metadata.write("args: %s\n" % args)
    #     metadata.write("num_states: %s\n" % str(ant_utils.num_states))
    #     metadata.write("state_bins: %s\n" % ant_utils.state_bins)

    policies = collect_entropy_policies(env, args.epochs, args.T)

    exploration_policy = average_policies(env, policies)
    if (args.collect_video):
        MODEL_DIR = ''
    # average_p = exploration_policy.execute(args.T, render=True, save_video_dir=MODEL_DIR+'videos/epoch_' + str(args.epochs) + '/')
    average_p = exploration_policy.execute(args.T)  # estimate the average state distribution
    overall_avg_ent = scipy.stats.entropy(average_p.flatten())

    # average_p = curiosity.execute_average_policy(env, policies, args.T, render=True)

    print('*************')
    # print(np.reshape(average_p, ant_utils.space_dim))

    print("overall_avg_ent = %f" % overall_avg_ent)

    env.close()

    print("DONE")
Code example #20
File: template.py  Project: Andrewthe13th/AI-Project
            ['right', 'A', 'B'],
        ]
        self.children = []

    def AddChild(self, randomAction):
        # Record the expanded child and mark its action as visited.
        self.children.append(randomAction)
        self.unvisitedActions.remove(randomAction)

    def selectRandomAction(self, randomValue):
        # Not random right now: return the first unvisited action.
        return self.unvisitedActions[0]

state = env.reset()

#use same seed to see same outcomes
prng.seed(1337)

# FIRST STEP OCCURED REGARDLESS
state, reward, done, info = env.step(env.action_space.sample())
#env.render()

#save the inital life number
lifeNum = info["life"] # always supposed to be 3

# check if the level has been completed
while not info['flag_get']:
    state, reward, done, info = env.step(env.action_space.sample())
    # render the action/frame that occured
    #env.render()

    print(info["life"])
Code example #21
env.reset() # reset environment to a new, random state
env.render()

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

state = env.encode(3, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

"""

"""
# seed the action-space sampler so the sampled actions are reproducible
prng.seed(1337)

env.s = 328  # set environment to illustration's state

epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
Code example #22
print(env.__doc__)
print("")

#################################
# Some basic imports and setup
# Let's look at what a random episode looks like.

import numpy as np, numpy.random as nr, gym
import matplotlib.pyplot as plt
#%matplotlib inline
np.set_printoptions(precision=3)

# Seed RNGs so you get the same printouts as me
env.seed(0)
from gym.spaces import prng
prng.seed(10)
# Generate the episode
env.reset()
for t in range(100):
    env.render()
    a = env.action_space.sample()
    ob, rew, done, _ = env.step(a)
    if done:
        break
assert done
env.render()

#################################
# Create MDP for our env
# We extract the relevant information from the gym Env into the MDP class below.
# The `env` object won't be used any further, we'll just use the `mdp` object.
Code example #23
    def _seed(self, seed=None):
        super(KukaPoseEnv, self)._seed(seed)
        prng.seed(seed)
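Example #23 hooks into the old gym.Env._seed interface. For comparison, here is a sketch of the same idea written against the newer Gym interface, under the assumption that gym.utils.seeding.np_random and per-space seed() methods are available; the class name and spaces below are made up for illustration.

import gym
from gym import spaces
from gym.utils import seeding


class SeededDummyEnv(gym.Env):
    def __init__(self):
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Discrete(16)
        self.np_random = None
        self.seed()

    def seed(self, seed=None):
        # Seed the environment's own RNG and propagate the seed to the
        # action space, replacing the global gym.spaces.prng.seed(...) call.
        self.np_random, seed = seeding.np_random(seed)
        self.action_space.seed(seed)
        return [seed]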
Code example #24
env = gym.make('FrozenLake-v0')
env = env.env
print(env.__doc__)
print("")

#################################
# Some basic imports and setup
# Let's look at what a random episode looks like.

import numpy as np, numpy.random as nr, gym
import matplotlib.pyplot as plt
#%matplotlib inline
np.set_printoptions(precision=3)

# Seed RNGs so you get the same printouts as me
env.seed(0); from gym.spaces import prng; prng.seed(10)
# Generate the episode
env.reset()
for t in range(100):
    env.render()
    a = env.action_space.sample()
    ob, rew, done, _ = env.step(a)
    if done:
        break
assert done
env.render();

#################################
# Create MDP for our env
# We extract the relevant information from the gym Env into the MDP class below.
# The `env` object won't be used any further, we'll just use the `mdp` object.
Code example #25
def run(seed_num, seed_path, run_path, run_type, train=True):
    env = make_atari(ENV_NAME)
    env = wrap_deepmind(env, frame_stack=True, scale=False)
    env.seed(seed_num)
    seed(seed_num)

    dqn_agent = DDQN(env, run_path=run_path)
    dqn_agent.train_target_network()
    obs = env.reset().__array__(dtype=np.uint8)

    if train:

        state = load_state(run_path)

        if state is not None:
            ep = state['ep']
            t_start = state['t']
            ep_steps = state['ep_steps']
            # ep_score = state['score']
            std_dev_score = state['std_dev_score']
            dqn_agent.epsilon = state['epsilon']
            avg_score = state['avg_score']
            best_avg_score = state['best_avg_score']
            replay_fill_size = state['replay_fill_size']
        else:
            ep = 0
            t_start = 0
            ep_steps = 0
            replay_fill_size = 0
            # ep_score = 0
            std_dev_score = 0
            avg_score = -21
            best_avg_score = -21

        plt_data = load_plot_data(run_path)

        if plt_data is not None:
            avg_score_vals = plt_data['avg_score_vals']
            epsilon_vals = plt_data['epsilon_vals']
            best_avg_score_vals = plt_data['best_avg_score_vals']
            std_dev_score_vals = plt_data['std_dev_score_vals']
            replay_fill_size_vals = plt_data['replay_fill_size_vals']
            score_window = plt_data['score_window']
        else:
            avg_score_vals = deque()
            epsilon_vals = deque()
            # epsilon_vals.append(dqn_agent.epsilon)

            best_avg_score_vals = deque()
            std_dev_score_vals = deque()
            replay_fill_size_vals = deque()
            score_window = deque(maxlen=args.score_window_size)

            avg_score_vals.append(avg_score)
            epsilon_vals.append(dqn_agent.epsilon)
            best_avg_score_vals.append(best_avg_score)
            std_dev_score_vals.append(std_dev_score)
            replay_fill_size_vals.append(replay_fill_size)

        ep_score = 0

        max_score = -21
        min_score = 21

        print('avg_score_vals: {}'.format(avg_score_vals))
        print('avg_score_vals_size: {}'.format(len(avg_score_vals)))
        print('std_dev_score_vals: {}'.format(std_dev_score_vals))
        print('std_dev_score_vals size: {}'.format(len(std_dev_score_vals)))
        print('best_avg_score_vals: {}'.format(best_avg_score_vals))
        print('best_avg_score_vals size: {}'.format(len(best_avg_score_vals)))
        print('replay_fill_size_vals: {}'.format(replay_fill_size_vals))
        print('replay_fill_size_vals size: {}'.format(
            len(replay_fill_size_vals)))
        print('epsilon_vals: {}'.format(epsilon_vals))
        print('epsilon_vals size: {}'.format(len(epsilon_vals)))
        print('score_window: {}'.format(score_window))
        print('score_window size: {}'.format(len(score_window)))

        for t in range(t_start, TIMESTEPS):
            action = dqn_agent.choose_action(obs)
            dqn_agent.update_epsilon(ep, rt=run_type)
            new_obs, rew, done, _ = env.step(action)
            new_obs = new_obs.__array__(dtype=np.uint8)
            dqn_agent.remember(obs, action, rew, new_obs, done)
            obs = new_obs

            ep_score += rew
            ep_steps += 1

            if done:
                obs = env.reset().__array__(dtype=np.uint8)
                score_window.append(ep_score)

                ep += 1

                print('Episode {} | Timestep {} -> Score: {}'.format(
                    ep, t, ep_score))

                avg_score = round(np.mean(score_window), 1)
                if avg_score > best_avg_score and t > dqn_agent.learn_start and ep % args.save_frequency == 0:
                    best_avg_score = avg_score
                    dqn_agent.save_mdl()
                std_dev_score = round(np.std(score_window), 1)
                avg_score_vals.append(avg_score)
                epsilon_vals.append(dqn_agent.epsilon)
                best_avg_score_vals.append(best_avg_score)
                std_dev_score_vals.append(std_dev_score)
                replay_fill_size_vals.append(
                    dqn_agent.replay_buffer.meta_data['fill_size'])

                print('Size of avg_score_vals buffer: {}'.format(
                    len(avg_score_vals)))

                if ep % args.log_frequency == 0:
                    print('Avg score: {}'.format(avg_score))
                    print('Time spent exploring: {} %'.format(
                        round(
                            100 * dqn_agent.exploration.low_damp_value(
                                ep,
                                wave_offset=args.wave_offset,
                                anneal_factor=args.anneal_factor * args.ep_lim,
                                damp_freq_factor=args.damp_freq_factor *
                                args.ep_lim), 2)))
                    print('Std dev of score: {}'.format(std_dev_score))
                    # print('Max score: {}'.format(max_score))
                    # print('Min score: {}'.format(min_score))
                    print('ReplayBuffer fill_size: {}'.format(
                        dqn_agent.replay_buffer.meta_data['fill_size']))
                    print('Score window contents: {}'.format(
                        np.array(score_window)))

                if ep > 0 and ep % args.plot_frequency == 0:

                    x_vals = np.arange(ep + 1)
                    draw_plot(avg_score_plt,
                              ax1,
                              x_vals,
                              avg_score_vals,
                              'Episodes',
                              'Avg Score',
                              plot_name='Episodes vs Avg Score',
                              plot_path=seed_path,
                              rt=run_type)
                    draw_plot(epsilon_vals_plt,
                              ax2,
                              x_vals,
                              epsilon_vals,
                              'Episodes',
                              'Epsilon',
                              plot_name='Episodes vs Epsilon',
                              plot_path=seed_path,
                              rt=run_type)
                    draw_plot(std_dev_plt,
                              ax3,
                              x_vals,
                              std_dev_score_vals,
                              'Episodes',
                              'Std Dev of Scores',
                              plot_name='Episodes vs Std dev of score',
                              plot_path=seed_path,
                              rt=run_type)
                    draw_plot(replay_fill_plt,
                              ax4,
                              x_vals,
                              replay_fill_size_vals,
                              'Episodes',
                              'Replay buffer Fill Size',
                              plot_name='Episodes vs Replay Buffer Fill size',
                              plot_path=seed_path,
                              rt=run_type)
                    draw_plot(best_avg_score_plt,
                              ax5,
                              x_vals,
                              best_avg_score_vals,
                              'Episodes',
                              'Best Avg Score',
                              plot_name='Episodes vs Best Avg Score',
                              plot_path=seed_path,
                              rt=run_type)

                if ep % args.save_frequency == 0:
                    plot_data = {
                        'avg_score_vals': avg_score_vals,
                        'epsilon_vals': epsilon_vals,
                        'best_avg_score_vals': best_avg_score_vals,
                        'std_dev_score_vals': std_dev_score_vals,
                        'replay_fill_size_vals': replay_fill_size_vals,
                        'score_window': score_window,
                    }

                    save_plot_data(plt_data=plot_data, run_path=run_path)
                    state_data = {
                        'ep': ep,
                        't': t,
                        'ep_steps': ep_steps,
                        'score': ep_score,
                        'avg_score': avg_score,
                        'std_dev_score': std_dev_score,
                        'replay_fill_size': replay_fill_size,
                        'best_avg_score': best_avg_score,
                        'epsilon': dqn_agent.epsilon
                    }
                    save_state(st_data=state_data, run_path=run_path)
                    dqn_agent.save()

                ep_score = 0
                ep_steps = 0

            if ep + 1 == EPISODES:  # or best_avg_score >= args.target_score:
                print(
                    '---------------------------just before finish-----------------------------------------'
                )
                x_vals = np.arange(ep + 1)
                draw_plot(avg_score_plt,
                          ax1,
                          x_vals,
                          avg_score_vals,
                          'Episodes',
                          'Avg Score',
                          plot_name='Episodes vs Avg Score',
                          plot_path=seed_path,
                          rt=run_type,
                          legend=True)
                draw_plot(epsilon_vals_plt,
                          ax2,
                          x_vals,
                          epsilon_vals,
                          'Episodes',
                          'Epsilon',
                          plot_name='Episodes vs Epsilon',
                          plot_path=seed_path,
                          rt=run_type,
                          legend=True)
                draw_plot(std_dev_plt,
                          ax3,
                          x_vals,
                          std_dev_score_vals,
                          'Episodes',
                          'Std Dev of Scores',
                          plot_name='Episodes vs Std dev of score',
                          plot_path=seed_path,
                          rt=run_type,
                          legend=True)
                draw_plot(replay_fill_plt,
                          ax4,
                          x_vals,
                          replay_fill_size_vals,
                          'Episodes',
                          'Replay buffer Fill Size',
                          plot_name='Episodes vs Replay Buffer Fill size',
                          plot_path=seed_path,
                          rt=run_type,
                          legend=True)
                draw_plot(best_avg_score_plt,
                          ax5,
                          x_vals,
                          best_avg_score_vals,
                          'Episodes',
                          'Best Avg Score',
                          plot_name='Episodes vs Best Avg Score',
                          plot_path=seed_path,
                          rt=run_type,
                          legend=True)

                break

            if t > dqn_agent.learn_start:
                if t % dqn_agent.train_freq == 0:
                    dqn_agent.replay()
                if t % dqn_agent.train_targets == 0:
                    dqn_agent.train_target_network()
Code example #26
def bMarioDead(currentLifeCount):
    # Mario has died if the reported life count differs from the saved one.
    global lifeNum
    return lifeNum != currentLifeCount


state = env.reset()

# =========== MAIN CODE =========================

#use same seed to see same outcomes
SEED = 1337
prng.seed(SEED)
random.seed(SEED)

# FIRST STEP OCCURED REGARDLESS -----ROOT------
# root = Node()
# #randomAction = env.action_space.sample()
# state, reward, done, info = env.step(0)
# print(info)
# #print(randomAction)
# #save the inital life number
# lifeNum = info["life"]
# currentChild = root.returnChild(0, info["x_pos"], bMarioDead(info["life"]))
# env.render()

lifeNum = 3
currentChild = Node(None, None, False, 0)
Code example #27
def env_thread(args, thread_num, partition=True, use_ppo2=False):
    """
    Run a session of an environment
    :param args: (ArgumentParser object)
    :param thread_num: (int) The thread ID of the environment session
    :param partition: (bool) If the output should be in multiple parts (default=True)
    :param use_ppo2: (bool) Use ppo2 to generate the dataset
    """
    env_kwargs = {
        "max_distance": args.max_distance,
        "random_target": args.random_target,
        "force_down": True,
        "is_discrete": not args.continuous_actions,
        "renders": thread_num == 0 and args.display,
        "record_data": not args.no_record_data,
        "multi_view": args.multi_view,
        "save_path": args.save_path,
        "shape_reward": args.shape_reward
    }

    if partition:
        env_kwargs["name"] = args.name + "_part-" + str(thread_num)
    else:
        env_kwargs["name"] = args.name

    env_class = registered_env[args.env][0]
    env = env_class(**env_kwargs)

    # Additional env when using a trained ppo agent to generate data
    # instead of a random agent
    train_env = env_class(**{**env_kwargs, "record_data": False, "renders": False})
    train_env = DummyVecEnv([lambda: train_env])
    train_env = VecNormalize(train_env, norm_obs=True, norm_reward=False)

    model = None
    if use_ppo2:
        model = PPO2(CnnPolicy, train_env).learn(args.ppo2_timesteps)

    frames = 0
    start_time = time.time()
    # divide evenly, then do an extra one for only some of them in order to get the right count
    for i_episode in range(args.num_episode // args.num_cpu + 1 * (args.num_episode % args.num_cpu > thread_num)):
        # seed + position in this slice + size of slice (with remainder if uneven partitions)
        seed = args.seed + i_episode + args.num_episode // args.num_cpu * thread_num + \
               (thread_num if thread_num <= args.num_episode % args.num_cpu else args.num_episode % args.num_cpu)

        env.seed(seed)
        prng.seed(seed)  # this is for the sample() function from gym.space
        obs = env.reset()
        done = False
        t = 0
        while not done:
            env.render()

            if use_ppo2:
                action, _ = model.predict([obs])
            else:
                action = [env.action_space.sample()]

            _, _, done, _ = env.step(action[0])
            frames += 1
            t += 1
            if done:
                print("Episode finished after {} timesteps".format(t + 1))

        if thread_num == 0:
            print("{:.2f} FPS".format(frames * args.num_cpu / (time.time() - start_time)))
Code example #28
    def seed(self, seed):
        prng.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
Code example #29
def run(seed_num, train=True, run_type='base'):
    env = gym.make(ENV_NAME)  #.env
    env._max_episode_steps = args.max_timesteps
    env.seed(seed_num)
    seed(seed_num)

    dqn_agent = DDQN(env)

    trial_seed_path = os.path.join(data_path, 'Seed ' + str(seed_num))
    if not os.path.exists(trial_seed_path):
        os.makedirs(trial_seed_path)

    if train:
        max_score = -1000
        min_score = 1000
        avg_score = -1000
        best_avg_score = -1000

        epsilon_vals = deque(maxlen=EPISODES)
        avg_score_vals = deque(maxlen=EPISODES)
        best_avg_score_vals = deque(maxlen=EPISODES)
        # max_score_vals = deque(maxlen=EPISODES)
        # min_score_vals = deque(maxlen=EPISODES)
        std_dev_score_vals = deque(maxlen=EPISODES)
        replay_fill_size_vals = deque(maxlen=EPISODES)
        score_window = deque(maxlen=100)

        for ep in range(EPISODES):
            curr_obs = env.reset()
            curr_obs = reshape_input(curr_obs)
            total_r = 0
            if ep % 10 == 0:
                render = False  #True
            epsilon_vals.append(dqn_agent.epsilon)
            while True:
                if render:
                    env.render()
                action = dqn_agent.choose_action(curr_obs)
                next_obs, reward, done, info = env.step(action)
                next_obs = reshape_input(next_obs)
                total_r += reward
                if total_r % 1000 == 0:
                    print('current reward for ep {}: reached {}'.format(
                        ep, total_r))
                dqn_agent.remember(curr_obs, action, reward, next_obs, done)
                curr_obs = next_obs
                dqn_agent.replay()
                dqn_agent.update_epsilon(ep, run_type)

                if done:
                    if render:
                        render = False
                        env.close()
                    if ep % dqn_agent.train_targets == 0:
                        dqn_agent.train_target_network()
                    break
            score_window.append(total_r)
            avg_score = np.mean(score_window)
            if avg_score > best_avg_score:
                best_avg_score = avg_score
                dqn_agent.save_mdl(trial_seed_path, run_type)
            std_dev_score = np.std(score_window)
            # max_score = max(total_r,max_score)
            # min_score = min(total_r,min_score)
            # max_score_vals.append(max_score)
            # min_score_vals.append(min_score)
            avg_score_vals.append(avg_score)
            best_avg_score_vals.append(best_avg_score)
            std_dev_score_vals.append(std_dev_score)
            replay_fill_size_vals.append(
                dqn_agent.replay_buffer.meta_data['fill_size'])

            print('Episode ', ep, ' -> Score: ', total_r)
            if ep % args.log_frequency == 0:
                print('Avg score: {}'.format(avg_score))
                print('Std dev of score: {}'.format(std_dev_score))
                print('Max score: {}'.format(max_score))
                print('Min score: {}'.format(min_score))
                print('ReplayBuffer fill_size: {}'.format(
                    dqn_agent.replay_buffer.meta_data['fill_size']))

            if ep > 0 and ep % args.plot_frequency == 0:
                if ep + 1 == EPISODES:
                    ax1.lines[-1].set_label(run_type)
                    ax2.lines[-1].set_label(run_type)
                    ax3.lines[-1].set_label(run_type)
                    ax4.lines[-1].set_label(run_type)
                    ax5.lines[-1].set_label(run_type)
                    ax1.legend()
                    ax2.legend()
                    ax3.legend()
                    ax4.legend()
                    ax5.legend()

                x_vals = np.arange(ep + 1)
                save_plot(avg_score_plt,
                          ax1,
                          x_vals,
                          avg_score_vals,
                          'Episodes',
                          'Avg Score',
                          plot_name='Episodes vs Avg Score',
                          trial_seed_path=trial_seed_path,
                          rt=run_type)
                save_plot(epsilon_vals_plt,
                          ax2,
                          x_vals,
                          epsilon_vals,
                          'Episodes',
                          'Epsilon',
                          plot_name='Episodes vs Epsilon',
                          trial_seed_path=trial_seed_path,
                          rt=run_type)
                save_plot(std_dev_plt,
                          ax3,
                          x_vals,
                          std_dev_score_vals,
                          'Episodes',
                          'Std Dev of Scores',
                          plot_name='Episodes vs Std dev of score',
                          trial_seed_path=trial_seed_path,
                          rt=run_type)
                save_plot(replay_fill_plt,
                          ax4,
                          x_vals,
                          replay_fill_size_vals,
                          'Episodes',
                          'Replay buffer Fill Size',
                          plot_name='Episodes vs Replay Buffer Fill size',
                          trial_seed_path=trial_seed_path,
                          rt=run_type)
                save_plot(best_avg_score_plt,
                          ax5,
                          x_vals,
                          best_avg_score_vals,
                          'Episodes',
                          'Best Avg Score',
                          plot_name='Episodes vs Best Avg Score',
                          trial_seed_path=trial_seed_path,
                          rt=run_type)

                save_plot_data(d=avg_score_vals,
                               trial_seed_path=trial_seed_path,
                               rt=run_type)
Code example #30
from rlkit.envs.gridcraft import REW_ARENA_64
from rlkit.envs.gridcraft.grid_env import GridEnv
from rlkit.envs.gridcraft.grid_spec import *
from rlkit.envs.gridcraft.mazes import MAZE_ANY_START1
import gym.spaces.prng as prng
import numpy as np

if __name__ == "__main__":
    prng.seed(2)

    maze_spec = \
        spec_from_string("SOOOO#R#OO\\"+
                         "OSOOO#2##O\\" +
                         "###OO#3O#O\\" +
                         "OOOOO#OO#O\\" +
                         "OOOOOOOOOO\\"
                         )

    #maze_spec = spec_from_sparse_locations(50, 50, {START: [(25,25)], REWARD: [(45,45)]})
    # maze_spec = REW_ARENA_64
    maze_spec = MAZE_ANY_START1

    env = GridEnv(maze_spec, one_hot=True, add_eyes=True, coordinate_wise=True)

    s = env.reset()
    #env.render()

    obses = []
    for t in range(10):
        a = env.action_space.sample()
        obs, r, done, infos = env.step(a, verbose=True)