Code example #1
def render(env, agent, name="", record=False):
    if record:
        env = Monitor(env,
                      './video-test/{}'.format(name),
                      force=True,
                      mode="evaluation")
    for i_episode in range(5):
        state = env.reset()
        total_reward = 0
        for step in range(STEPS):
            state = np.expand_dims(state, axis=0)
            env.render()

            action_index = agent.act(state)
            action = decode_action(action_index)

            next_state, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                break
            state = next_state

        print("Episode achieves total reward {}".format(total_reward))
Code example #2
File: car_racing_v1.py  Project: Hewiiitt/gym
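The snippet starts inside `key_release`, so the setup it relies on is missing. A plausible reconstruction, loosely based on the keyboard demo in the upstream gym `car_racing.py` (everything below is an assumption about this fork, not its actual code), would be:

    from pyglet.window import key
    a = np.array([0.0, 0.0, 0.0])  # [steering, gas, brake]; numpy is assumed imported as np

    def key_press(k, mod):
        global restart
        if k == 0xff0d: restart = True  # Enter restarts the episode
        if k == key.LEFT: a[0] = -1.0
        if k == key.RIGHT: a[0] = +1.0
        if k == key.UP: a[1] = +1.0
        if k == key.DOWN: a[2] = +0.8  # keep below 1.0 so the wheels do not lock

    def key_release(k, mod):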
        if k == key.LEFT and a[0] == -1.0: a[0] = 0
        if k == key.RIGHT and a[0] == +1.0: a[0] = 0
        if k == key.UP: a[1] = 0
        if k == key.DOWN: a[2] = 0

    env = CarRacingV1()
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, '/tmp/video-test', force=True)
    isopen = True
    while isopen:
        env.reset()
        total_reward = 0.0
        steps = 0
        restart = False
        while True:
            s, r, done, info = env.step(a)
            total_reward += r
            if steps % 200 == 0 or done:
                print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                print("step {} total_reward {:+0.2f}".format(
                    steps, total_reward))
                #import matplotlib.pyplot as plt
                #plt.imshow(s)
                #plt.savefig("test.jpeg")
            steps += 1
            isopen = env.render()
            if done or restart or not isopen:
                break
    env.close()
Code example #3
import gym
from gym.wrappers.monitor import Monitor
from challenge1 import get_model, get_policy

# 1. Learn the model f: s, a -> s', r
env = Monitor(gym.make('Pendulum-v0'),
              'training',
              video_callable=False,
              force=True)
env.seed(98251624)
max_num_samples = 10000
model = get_model(env, max_num_samples)

# Your model will be tested on the quality of prediction
obs = env.reset()
act = env.action_space.sample()
nobs, rwd, _, _ = env.step(act)
nobs_pred, rwd_pred = model(obs, act)
print(f'truth = {nobs, rwd}\nmodel = {nobs_pred, rwd_pred}')
env.close()

# 2. Perform dynamic programming using the learned model
env = Monitor(gym.make('Pendulum-v0'), 'evaluation', force=True)
env.seed(31186490)
policy = get_policy(model, env.observation_space, env.action_space)

# Your policy will be judged based on the average episode return
n_eval_episodes = 100
for _ in range(n_eval_episodes):
    done = False
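The example is truncated at this point; a minimal sketch of the remaining evaluation rollout, assuming the object returned by `get_policy` can be called as `policy(obs)` (an assumption, not part of the original code):

    obs, episode_return = env.reset(), 0.0
    while not done:
        obs, rwd, done, _ = env.step(policy(obs))
        episode_return += rwd
env.close()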
Code example #4
class DQNAgent:
    def __init__(self, lr, momentum, alpha, gamma, target_update_frequency,
                 local_update_frequency, replay_start_size, queue_len,
                 batch_size):
        gym.logger.set_level(40)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.env = gym.make('LunarLander-v2')
        self.replay_buffer = ReplayBuffer(queue_len, self.device, alpha)

        self.local_qnetwork = DQNModel().to(self.device)
        self.target_qnetwork = DQNModel().to(self.device)
        self.target_qnetwork.load_state_dict(self.local_qnetwork.state_dict())
        self.optimizer = optim.RMSprop(self.local_qnetwork.parameters(),
                                       lr=lr,
                                       momentum=momentum)

        self.gamma = gamma
        self.target_update_frequency = target_update_frequency
        self.local_update_frequency = local_update_frequency
        self.replay_start_size = replay_start_size
        self.batch_size = batch_size

        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.episode_step = 0

    def agent_step(self, state, eps, beta):
        next_state, reward, done = self.env_step(state, eps)
        if len(self.replay_buffer.queue) < self.replay_start_size:
            return next_state, reward, None, done

        # Update the local q network every local_update_frequency steps
        loss = None
        if self.episode_step % self.local_update_frequency == 0:
            loss = self.qnetwork_step(beta)

        # Update the target q network every target_update_frequency steps
        if self.episode_step % self.target_update_frequency == 0:
            self.target_qnetwork.load_state_dict(
                self.local_qnetwork.state_dict())

        self.episode_step += 1
        return next_state, reward, loss, done

    def env_step(self, state, eps):
        action = self.policy(state, eps)
        next_state, reward, done, _ = self.env.step(action)

        self.replay_buffer.put([state, action, reward, next_state, done])
        return next_state, reward, done

    def qnetwork_step(self, beta):
        states, actions, rewards, next_states, dones, indices, is_weights = self.replay_buffer.batch_get(
            self.batch_size, self.state_size, beta)

        # Double DQN
        next_target_actions = torch.argmax(self.local_qnetwork(next_states),
                                           dim=1).unsqueeze(1)
        next_target_rewards = self.target_qnetwork(next_states).gather(
            1, next_target_actions)
        target_rewards = rewards + self.gamma * next_target_rewards * (1 -
                                                                       dones)
        local_rewards = self.local_qnetwork(states).gather(1, actions.long())

        self.optimizer.zero_grad()
        td_error = (local_rewards - target_rewards.detach())**2
        loss = torch.mean(is_weights.unsqueeze(1) * td_error)
        loss.backward()
        for param in self.local_qnetwork.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.replay_buffer.update_priorities(indices,
                                             td_error.data.cpu() + 0.0001)
        return loss.item()

    def policy(self, state, eps):
        if random.random() < eps:
            # Random action
            return self.env.action_space.sample()
        else:
            # Act according to local q network
            self.local_qnetwork.eval()
            with torch.no_grad():
                out = self.local_qnetwork(
                    torch.FloatTensor(state).to(
                        self.device).unsqueeze(0)).cpu()
            self.local_qnetwork.train()

            return torch.argmax(out).item()

    def reset(self, record):
        self.episode_step = 0

        if record:
            self.env = Monitor(gym.make('LunarLander-v2'),
                               "recordings",
                               video_callable=lambda episode_id: True,
                               force=True)
        else:
            self.env = gym.make('LunarLander-v2')

        return self.env.reset()
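A minimal sketch of how this agent might be driven (the hyperparameter values, episode count, and epsilon/beta schedules below are illustrative assumptions, not taken from the project):

agent = DQNAgent(lr=2.5e-4, momentum=0.95, alpha=0.6, gamma=0.99,
                 target_update_frequency=1000, local_update_frequency=4,
                 replay_start_size=1000, queue_len=100000, batch_size=64)

for episode in range(500):
    state = agent.reset(record=(episode % 100 == 0))
    done = False
    eps = max(0.05, 1.0 - episode / 400)   # linear epsilon decay (assumed schedule)
    beta = min(1.0, 0.4 + episode / 400)   # anneal importance-sampling beta (assumed)
    while not done:
        state, reward, loss, done = agent.agent_step(state, eps, beta)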
Code example #5
import gym
from gym.wrappers.monitor import Monitor
from tf_rl.env.pybullet.env_list import ENVS

for key, env_name in ENVS.items():
    print(env_name)
    env = gym.make(env_name)
    env = Monitor(env=env, directory="./video/{}".format(key), force=True)

    state = env.reset()
    for t in range(100):
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        if done:
            break

    env.close()
Code example #6
def main():
    toRender = {
        "line": 1,
        "circle": 1,
        "parabola": 0,
        "cycloid": 1,
        "random": 1,
        "rl": 0
    }

    if len(sys.argv) == 2:
        #read actions from file
        global env4list
        #toRender["rl"] = 1
        #fin = open(sys.argv[1],"r")
        #line = fin.readline()
        env4list = np.load(sys.argv[1])
        env4list = smooth(env4list)
        toRender["rl"] = 1

        #fin.close()

    global gViewer
    gViewer = rendering.Viewer(600, 600)
    saveVideo = True

    global env0, env0theta, env0done
    if toRender["random"]:
        env0 = bc.BrachistochroneEnv("random", gViewer, (0, 0, 0))
        if saveVideo:
            from gym.wrappers.monitor import Monitor
            env0 = Monitor(env0, './video-test', force=True)

        env0.reset()
        env0theta = 0
        env0done = False
        env0.score_label.x = gViewer.width - 150
        env0.score_label.y = gViewer.height - 10
    if toRender["line"]:
        global env1, env1theta, env1done
        env1 = bc.BrachistochroneEnv("line", gViewer, (1, 0, 0))
        if toRender["random"]:
            env1.setStartPosition(env0.start_position)
        env1done = False
        env1theta = math.atan(
            (env1.goal_position[1] - env1.start_position[1]) /
            (env1.goal_position[0] - env1.start_position[0])) / (math.pi)
        env1.reset()
        env1.score_label.x = gViewer.width - 150
        env1.score_label.y = gViewer.height - 25

    if toRender["circle"]:
        global env2, env2theta, env2done
        env2 = bc.BrachistochroneEnv("circle", gViewer, (0, 0, 1))
        if toRender["random"]:
            env2.setStartPosition(env0.start_position)
        env2done = False
        env2theta = 2 * math.atan(
            (env2.goal_position[1] - env2.start_position[1]) /
            (env2.goal_position[0] - env2.start_position[0])) / (math.pi)
        env2.reset()
        env2.score_label.x = gViewer.width - 150
        env2.score_label.y = gViewer.height - 40

    if toRender["cycloid"]:
        global env3, env3theta, env3done, R_cycloid, T_Cycloid
        env3 = bc.BrachistochroneEnv("cycloid", gViewer, (0, 0.75, 0.25))
        if toRender["random"]:
            env3.setStartPosition(env0.start_position)
        R_cycloid, T_Cycloid = solveCycloidInit(env3.start_position,
                                                env3.goal_position)
        env3theta = 2 * math.atan(
            (env3.goal_position[1] - env3.start_position[1]) /
            (env3.goal_position[0] - env3.start_position[0])) / (math.pi)
        env3done = False
        env3.reset()
        env3.score_label.x = gViewer.width - 150
        env3.score_label.y = gViewer.height - 55
    if toRender["rl"]:
        global env4, env4theta, env4done
        env4 = bc.BrachistochroneEnv("RL Agent", gViewer, (1, 0.5, 0))
        env4.reset()
        env4theta = 0
        env4done = False
        env4.score_label.x = gViewer.width - 150
        env4.score_label.y = gViewer.height - 70

    numsteps = 1000
    for i in range(numsteps):

        toRender["random"] and env0.render()
        toRender["line"] and env1.render()
        toRender["circle"] and env2.render()
        toRender["cycloid"] and env3.render()
        toRender["rl"] and env4.render()

        if toRender["random"] and not env0done:
            env0theta = env0.action_space.sample()
            _, _, env0done, _ = env0.step(np.float32(env0theta))
        if toRender["line"] and not env1done:
            _, _, env1done, _ = env1.step(np.float32([env1theta]))
        if toRender["circle"] and not env2done:
            _, _, env2done, _ = env2.step(np.float32([env2theta]))
            env2theta = 2 * math.atan(
                (env2.goal_position[1] - env2.state[1]) /
                (env2.goal_position[0] - env2.state[0])) / math.pi
        if toRender["cycloid"] and not env3done:
            _, _, env3done, _ = env3.step(np.float32([env3theta]))
            env3theta = solveCycloid(env3.start_position,
                                     [env3.state[0], env3.state[1]])
        """
        if toRender["rl"] and not env5done:
            line = fin.readline()
            if line:
                env0theta = [float(line)]
                _,_,env0done,_ = env5.step(np.float32([env5theta]))
            else:
                env0done = True
        """
        if toRender["rl"] and not env4done:
            if i >= len(env4list):
                continue
            env4theta = env4list[i]
            _, _, env4done, _ = env4.step(np.float32([env4theta]))

    toRender["random"] and env0.close()
    toRender["line"] and env1.close()
    toRender["circle"] and env2.close()
    toRender["cycloid"] and env3.close()
    if toRender["rl"]:
        pts = env4.path
        print(pts)
        coeffs = polyfit(pts)
        env4.close()
    return
Code example #7
File: play.py  Project: Tubbz-alt/atari-world-models
    def __call__(self, step_limit, solution=None, stamp=None, record=False):
        logger.info("Playing game %s with step_limit %d", self.game, step_limit)

        with torch.no_grad():
            controller = Controller(self.game, self.models_dir)
            if solution is not None:
                controller.load_solution(solution)
            else:
                controller.load_state(stamp)

            vae = VAE(self.game, self.models_dir)
            vae.load_state()

            mdn_rnn = MDN_RNN(self.game, self.models_dir)
            mdn_rnn.load_state()

            env = gym.make(self.game.key)
            if self.game.wrapper is not None:
                env = self.game.wrapper(env)
            if record:
                env = Monitor(env, "monitor", force=True)

            action = torch.zeros(self.game.action_vector_size)

            screen = env.reset()
            screen = transform(screen)
            screen.unsqueeze_(0)

            z, _, _ = vae.encoder(screen)
            _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))

            # h = torch.tensor([[[0] * 256]], dtype=torch.float32)

            overall_reward = 0
            steps = 0

            while True:
                env.render()
                action = controller(z.squeeze(0).squeeze(0), h.squeeze(0).squeeze(0))

                actual_action = self.game.transform_action(action.detach().numpy())
                screen, reward, done, _ = env.step(actual_action)

                overall_reward += reward
                screen = transform(screen)
                screen.unsqueeze_(0)

                z, _, _ = vae.encoder(screen)
                _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))

                if done or (step_limit and steps >= step_limit):
                    if done:
                        logger.info("Game reached done")
                    else:
                        logger.info("Step limit reached")

                    break

                steps += 1
            env.close()

            # Transform reward to be useful to CMA-ES
            overall_reward = self.game.transform_overall_reward(overall_reward)

            logger.info("Game %s finished with reward %d", self.game.key, overall_reward)

        return overall_reward
Code example #8
from Controller import Controller
from car_racing import CarRacing
from gym.wrappers.monitor import Monitor

C = Controller()

for weights in [BEST_CONTROLLER_WEIGHTS, OPTIMAL_CONTROLLER_WEIGHTS]:
    ENV = Monitor(CarRacing(), f'{weights[:-5]}_SIM', force=True)

    try:
        C.load_parameters(weights)
    except Exception as exc:
        raise Exception('Train the Controller first.') from exc

    done = False
    steps = 0

    observation = ENV.reset()
    reward_FULL = 0

    while not done and steps < MAX_STEPS:
        ENV.render()

        action = C.get_action(observation)
        observation, reward, done, _ = ENV.step(action)

        reward_FULL += reward
        steps += 1

    ENV.close()
    print(f'{weights} Reward: {reward_FULL}')
Code example #9
import gym
from gym.wrappers.monitor import Monitor
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

env = gym.make("Assault-v0")
env = Monitor(env, "videos", force=True)

# test
img = env.reset()  # returns images of shape (250, 160, 3)

### Testing the model
from model import Model
model = Model(num_actions=env.action_space.n)
action, value = model.action_value(obs=img)

# Creation of the Agent


class Agent():
    def __init__(self, model):

        self.params = {"value": 1 / 7, "entropy": 0.0001, "gamma": 0.75}

        self.model = model

        self.model.compile(tf.keras.optimizers.Adam(lr=0.0001),
                           loss=[self._logits_loss, self._value_loss])

    ### (Author's note: not fully clear on the idea of advantages)
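    # The example is truncated here. A minimal sketch of the two loss functions
    # referenced in compile() above, following the common tf.keras A2C pattern
    # (an assumption about the intended implementation, not the original code):
    def _value_loss(self, returns, value):
        # weighted MSE between bootstrapped returns and predicted state values
        return self.params["value"] * tf.keras.losses.mean_squared_error(returns, value)

    def _logits_loss(self, actions_and_advantages, logits):
        actions, advantages = tf.split(actions_and_advantages, 2, axis=-1)
        cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        # policy-gradient term: cross-entropy of the taken actions, weighted by advantages
        policy_loss = cross_entropy(tf.cast(actions, tf.int32), logits,
                                    sample_weight=advantages)
        # entropy bonus encourages exploration
        probs = tf.nn.softmax(logits)
        entropy_loss = tf.keras.losses.categorical_crossentropy(probs, probs)
        return policy_loss - self.params["entropy"] * entropy_loss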
Code example #10
def test(args,
         worker_id: int,
         global_model: torch.nn.Module,
         T: Value,
         global_reward: Value = None,
         optimizer: torch.optim.Optimizer = None,
         global_model_critic: CriticNetwork = None,
         optimizer_critic: torch.optim.Optimizer = None):
    """
    Start a worker in _test mode, i.e. no training is done; it only evaluates the current performance.
    Loosely based on https://github.com/ikostrikov/pytorch-a3c/blob/master/_test.py
    :param args: console arguments
    :param worker_id: id of the worker, used to differentiate workers and to init different seeds
    :param global_model: global model which is optimized / for split models: the actor
    :param T: global counter of steps
    :param global_reward: global running reward value
    :param optimizer: optimizer for the shared model / for split models: the actor model
    :param global_model_critic: optional global critic model for split networks
    :param optimizer_critic: optional critic optimizer for split networks
    :return: None
    """

    logging.info("test worker started.")
    torch.manual_seed(args.seed + worker_id)

    if "RR" in args.env_name:
        env = quanser_robots.GentlyTerminating(gym.make(args.env_name))
    else:
        if args.monitor:
            env = Monitor(gym.make(args.env_name),
                          '100_test_runs',
                          video_callable=lambda count: count % 100 == 0,
                          force=True)
        else:
            env = gym.make(args.env_name)

    env.seed(args.seed + worker_id)

    normalizer = get_normalizer(args.normalizer, env)

    # get an instance of the current global model state
    model = copy.deepcopy(global_model)
    model.eval()

    model_critic = None
    if global_model_critic:
        model_critic = copy.deepcopy(global_model_critic)
        model_critic.eval()

    state = torch.from_numpy(env.reset())

    writer = SummaryWriter(comment='_test', log_dir='experiments/runs/')
    start_time = time.time()

    t = 0
    episode_reward = 0

    done = False
    global_iter = 0
    best_global_reward = -np.inf
    best_test_reward = -np.inf

    while True:

        # Get params from shared global model
        model.load_state_dict(global_model.state_dict())
        if not args.shared_model:
            model_critic.load_state_dict(global_model_critic.state_dict())

        rewards = []
        eps_len = []

        sleep = True

        # make args.test_runs runs to get the current avg performance
        for i in range(args.test_runs):
            while not done:
                t += 1

                if not args.no_render:
                    if i == 0 and t % 1 == 0 and "RR" not in args.env_name:
                        env.render()
                        if args.monitor and sleep:  # add a small delay to do a screen capture of the test run if needed
                            time.sleep(1)
                            sleep = False

                # apply min/max scaling on the environment

                with torch.no_grad():

                    # select mean of normal dist as action --> Expectation
                    if args.shared_model:
                        _, mu, _ = model(normalizer(state))
                    else:
                        mu, _ = model(normalizer(state))

                    action = mu.detach()

                state, reward, done, _ = env.step(
                    np.clip(action.numpy(), -args.max_action, args.max_action))

                done = done or t >= args.max_episode_length
                episode_reward += reward

                if done:
                    # reset current cumulated reward and episode counter as well as env
                    rewards.append(episode_reward)
                    episode_reward = 0

                    eps_len.append(t)
                    t = 0

                    state = env.reset()

                state = torch.from_numpy(state)

            # necessary to make more than one run
            done = False

        time_print = time.strftime("%Hh %Mm %Ss",
                                   time.gmtime(time.time() - start_time))

        std_reward = np.std(rewards)
        rewards = np.mean(rewards)

        new_best = rewards > best_test_reward
        writer.add_scalar("reward/test", rewards, int(T.value))
        writer.add_scalar("episode/length", np.mean(eps_len), int(T.value))

        log_string = f"Time: {time_print}, T={T.value} -- n_runs={args.test_runs} -- mean total reward={rewards:.5f} " \
            f" +/- {std_reward:.5f} -- mean episode length={np.mean(eps_len):.5f}" \
            f" +/- {np.std(eps_len):.5f} -- global reward={global_reward.value:.5f}"

        if new_best:
            # highlight messages if progress was done
            logging.info(log_string)

            best_global_reward = max(best_global_reward, global_reward.value)
            best_test_reward = max(best_test_reward, rewards)
            model_type = 'shared' if args.shared_model else 'split'

            save_checkpoint(
                {
                    'epoch': T.value,
                    'model': model.state_dict(),
                    'model_critic': model_critic.state_dict() if model_critic is not None else None,
                    'global_reward': global_reward.value,
                    # only save optimizers if shared ones are used
                    'optimizer': optimizer.state_dict() if optimizer else None,
                    'optimizer_critic': optimizer_critic.state_dict() if optimizer_critic else None,
                },
                path=f"./experiments/checkpoints/model_{model_type}_T-{T.value}"
                     f"_global-{global_reward.value:.5f}_test-{rewards:.5f}.pth.tar")
        else:
            # use by default only debug messages if no progress was reached
            logging.debug(log_string)

        global_iter += 1

        # run evaluation only once in test mode
        if args.test:
            break
Code example #11
import gym
import numpy as np
from gym.wrappers.monitor import Monitor
import fetch_block_construction
env = gym.make(
    'FetchBlockConstruction_2Blocks_SparseReward_DictstateObs_42Rendersize_FalseStackonly_SingletowerCase-v1'
)
env = Monitor(env, directory="videos", force=True, video_callable=lambda x: x)

env.env._max_episode_steps = 50
# env.env.seed(0)

env.reset()
env.env.stack_only = True

step = 0
while True:
    obs, done = env.reset(), False
    while not done:
        # env.render()
        action = np.asarray([0, 0, 0, 0])
        step_results = env.step(action)
        obs, reward, done, info = step_results
        print("Reward: {} Info: {}".format(reward, info))
        if done:
            step = 0
        step += 1
        print(step)