Example no. 1
def render(env, agent, name="", record=False):
    if record:
        env = Monitor(env,
                      './video-test/{}'.format(name),
                      force=True,
                      mode="evaluation")
    for i_episode in range(5):
        state = env.reset()
        total_reward = 0
        for _ in range(STEPS):
            state = np.expand_dims(state, axis=0)
            env.render()

            action_index = agent.act(state)
            action = decode_action(action_index)

            next_state, reward, done, _ = env.step(action)
            total_reward += reward  # accumulate reward before checking for termination
            if done:
                break
            state = next_state

        print("Episode {} achieved total reward {}".format(i_episode, total_reward))
Example no. 2
        if k == key.DOWN: a[2] = 0

    env = CarRacingV1()
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, '/tmp/video-test', force=True)
    isopen = True
    while isopen:
        env.reset()
        total_reward = 0.0
        steps = 0
        restart = False
        while True:
            s, r, done, info = env.step(a)
            total_reward += r
            if steps % 200 == 0 or done:
                print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                print("step {} total_reward {:+0.2f}".format(
                    steps, total_reward))
                #import matplotlib.pyplot as plt
                #plt.imshow(s)
                #plt.savefig("test.jpeg")
            steps += 1
            isopen = env.render()
            if done or restart or not isopen:
                break
    env.close()
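The snippet above starts inside a key_release handler; a sketch of the keyboard setup it assumes, modeled on the standard gym CarRacing keyboard demo (the exact bindings and the pyglet-based handlers here are assumptions, not code from the source):

import numpy as np
from pyglet.window import key

a = np.array([0.0, 0.0, 0.0])  # [steering, gas, brake]

def key_press(k, mod):
    global restart
    if k == 0xff0d:
        restart = True  # Enter restarts the episode
    if k == key.LEFT:
        a[0] = -1.0
    if k == key.RIGHT:
        a[0] = +1.0
    if k == key.UP:
        a[1] = +1.0
    if k == key.DOWN:
        a[2] = +0.8  # brake at partial strength

def key_release(k, mod):
    if k == key.LEFT and a[0] == -1.0:
        a[0] = 0
    if k == key.RIGHT and a[0] == +1.0:
        a[0] = 0
    if k == key.UP:
        a[1] = 0
    if k == key.DOWN:
        a[2] = 0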
Example no. 3
def main():
    toRender = {
        "line": 1,
        "circle": 1,
        "parabola": 0,
        "cycloid": 1,
        "random": 1,
        "rl": 0
    }

    if len(sys.argv) == 2:
        # read actions from file
        global env4list
        #toRender["rl"] = 1
        #fin = open(sys.argv[1],"r")
        #line = fin.readline()
        env4list = np.load(sys.argv[1])
        env4list = smooth(env4list)
        toRender["rl"] = 1

        #fin.close()

    global gViewer
    gViewer = rendering.Viewer(600, 600)
    saveVideo = True

    global env0, env0theta, env0done
    if toRender["random"]:
        env0 = bc.BrachistochroneEnv("random", gViewer, (0, 0, 0))
        if saveVideo:
            from gym.wrappers.monitor import Monitor
            env0 = Monitor(env0, './video-test', force=True)

        env0.reset()
        env0theta = 0
        env0done = False
        env0.score_label.x = gViewer.width - 150
        env0.score_label.y = gViewer.height - 10
    if toRender["line"]:
        global env1, env1theta, env1done
        env1 = bc.BrachistochroneEnv("line", gViewer, (1, 0, 0))
        if toRender["random"]:
            env1.setStartPosition(env0.start_position)
        env1done = False
        env1theta = math.atan(
            (env1.goal_position[1] - env1.start_position[1]) /
            (env1.goal_position[0] - env1.start_position[0])) / (math.pi)
        env1.reset()
        env1.score_label.x = gViewer.width - 150
        env1.score_label.y = gViewer.height - 25

    if toRender["circle"]:
        global env2, env2theta, env2done
        env2 = bc.BrachistochroneEnv("circle", gViewer, (0, 0, 1))
        if toRender["random"]:
            env2.setStartPosition(env0.start_position)
        env2done = False
        env2theta = 2 * math.atan(
            (env2.goal_position[1] - env2.start_position[1]) /
            (env2.goal_position[0] - env2.start_position[0])) / (math.pi)
        env2.reset()
        env2.score_label.x = gViewer.width - 150
        env2.score_label.y = gViewer.height - 40

    if toRender["cycloid"]:
        global env3, env3theta, env3done, R_cycloid, T_Cycloid
        env3 = bc.BrachistochroneEnv("cycloid", gViewer, (0, 0.75, 0.25))
        if toRender["random"]:
            env3.setStartPosition(env0.start_position)
        R_cycloid, T_Cycloid = solveCycloidInit(env3.start_position,
                                                env3.goal_position)
        env3theta = 2 * math.atan(
            (env3.goal_position[1] - env3.start_position[1]) /
            (env3.goal_position[0] - env3.start_position[0])) / (math.pi)
        env3done = False
        env3.reset()
        env3.score_label.x = gViewer.width - 150
        env3.score_label.y = gViewer.height - 55
    if toRender["rl"]:
        global env4, env4theta, env4done
        env4 = bc.BrachistochroneEnv("RL Agent", gViewer, (1, 0.5, 0))
        env4.reset()
        env4theta = 0
        env4done = False
        env4.score_label.x = gViewer.width - 150
        env4.score_label.y = gViewer.height - 70

    numsteps = 1000
    for i in range(numsteps):

        toRender["random"] and env0.render()
        toRender["line"] and env1.render()
        toRender["circle"] and env2.render()
        toRender["cycloid"] and env3.render()
        toRender["rl"] and env4.render()

        if toRender["random"] and not env0done:
            env0theta = env0.action_space.sample()
            _, _, env0done, _ = env0.step(np.float32(env0theta))
        if toRender["line"] and not env1done:
            _, _, env1done, _ = env1.step(np.float32([env1theta]))
        if toRender["circle"] and not env2done:
            _, _, env2done, _ = env2.step(np.float32([env2theta]))
            env2theta = 2 * math.atan(
                (env2.goal_position[1] - env2.state[1]) /
                (env2.goal_position[0] - env2.state[0])) / math.pi
        if toRender["cycloid"] and not env3done:
            _, _, env3done, _ = env3.step(np.float32([env3theta]))
            env3theta = solveCycloid(env3.start_position,
                                     [env3.state[0], env3.state[1]])
        """
        if toRender["rl"] and not env5done:
            line = fin.readline()
            if line:
                env0theta = [float(line)]
                _,_,env0done,_ = env5.step(np.float32([env5theta]))
            else:
                env0done = True
        """
        if toRender["rl"] and not env4done:
            if i >= len(env4list):
                continue
            env4theta = env4list[i]
            _, _, env4done, _ = env4.step(np.float32([env4theta]))

    toRender["random"] and env0.close()
    toRender["line"] and env1.close()
    toRender["circle"] and env2.close()
    toRender["cycloid"] and env3.close()
    if toRender["rl"]:
        pts = env4.path
        print(pts)
        coeffs = polyfit(pts)
        env4.close()
    return
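The script above relies on a smooth() helper that is not shown; one illustrative possibility, a plain moving average over the loaded action sequence, is sketched here (the window size and the implementation are assumptions, not the project's actual helper):

import numpy as np

def smooth(actions, window=5):
    """Hypothetical smoothing helper: moving average over a 1-D action sequence."""
    actions = np.asarray(actions, dtype=np.float32).ravel()
    kernel = np.ones(window, dtype=np.float32) / window
    return np.convolve(actions, kernel, mode="same")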
Example no. 4
    def __call__(self, step_limit, solution=None, stamp=None, record=False):
        logger.info("Playing game %s with step_limit %d", self.game, step_limit)

        with torch.no_grad():
            controller = Controller(self.game, self.models_dir)
            if solution is not None:
                controller.load_solution(solution)
            else:
                controller.load_state(stamp)

            vae = VAE(self.game, self.models_dir)
            vae.load_state()

            mdn_rnn = MDN_RNN(self.game, self.models_dir)
            mdn_rnn.load_state()

            env = gym.make(self.game.key)
            if self.game.wrapper is not None:
                env = self.game.wrapper(env)
            if record:
                env = Monitor(env, "monitor", force=True)

            action = torch.zeros(self.game.action_vector_size)

            screen = env.reset()
            screen = transform(screen)
            screen.unsqueeze_(0)

            z, _, _ = vae.encoder(screen)
            _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))

            # h = torch.tensor([[[0] * 256]], dtype=torch.float32)

            overall_reward = 0
            steps = 0

            while True:
                env.render()
                action = controller(z.squeeze(0).squeeze(0), h.squeeze(0).squeeze(0))

                actual_action = self.game.transform_action(action.detach().numpy())
                screen, reward, done, _ = env.step(actual_action)

                overall_reward += reward
                screen = transform(screen)
                screen.unsqueeze_(0)

                z, _, _ = vae.encoder(screen)
                _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))

                if done or (step_limit and steps >= step_limit):
                    if done:
                        logger.info("Game reached done")
                    else:
                        logger.info("Step limit reached")

                    break

                steps += 1
            env.close()

            # Transform reward to be useful to CMA-ES
            overall_reward = self.game.transform_overall_reward(overall_reward)

            logger.info("Game %s finished with reward %d", self.game.key, overall_reward)

        return overall_reward
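The play loop above depends on a transform callable that is not shown; a plausible sketch, assuming World Models style 64x64 preprocessing (the exact resize and normalization are assumptions):

from torchvision import transforms

# Hypothetical preprocessing pipeline; the real transform in the source may differ.
transform = transforms.Compose([
    transforms.ToPILImage(),      # env frames arrive as HxWxC uint8 arrays
    transforms.Resize((64, 64)),  # 64x64 input, as in the original World Models setup
    transforms.ToTensor(),        # CxHxW float tensor in [0, 1]
])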
Example no. 5
from Controller import Controller
from car_racing import CarRacing
from gym.wrappers.monitor import Monitor

C = Controller()

for weights in [BEST_CONTROLLER_WEIGHTS, OPTIMAL_CONTROLLER_WEIGHTS]:
    ENV = Monitor(CarRacing(), f'{weights[:-5]}_SIM', force=True)

    try:
        C.load_parameters(weights)
    except Exception as exc:
        raise Exception('Train the Controller first.') from exc

    done = False
    steps = 0

    observation = ENV.reset()
    reward_FULL = 0

    while not done and steps < MAX_STEPS:
        ENV.render()

        action = C.get_action(observation)
        observation, reward, done, _ = ENV.step(action)

        reward_FULL += reward
        steps += 1

    ENV.close()
    print(f'{weights} Reward: {reward_FULL}')
Example no. 6
def test(args,
         worker_id: int,
         global_model: torch.nn.Module,
         T: Value,
         global_reward: Value = None,
         optimizer: torch.optim.Optimizer = None,
         global_model_critic: CriticNetwork = None,
         optimizer_critic: torch.optim.Optimizer = None):
    """
    Start worker in _test mode, i.e. no training is done; it is only used to validate the current performance.
    Loosely based on https://github.com/ikostrikov/pytorch-a3c/blob/master/_test.py
    :param args: console arguments
    :param worker_id: id of the worker, used to differentiate workers and init different seeds
    :param global_model: global model which is optimized / for split models: the actor
    :param T: global counter of steps
    :param global_reward: global running reward value
    :param optimizer: optimizer for the shared model / for split models: the actor model
    :param global_model_critic: optional global critic model for split networks
    :param optimizer_critic: optional critic optimizer for split networks
    :return: None
    """

    logging.info("test worker started.")
    torch.manual_seed(args.seed + worker_id)

    if "RR" in args.env_name:
        env = quanser_robots.GentlyTerminating(gym.make(args.env_name))
    else:
        if args.monitor:
            env = Monitor(gym.make(args.env_name),
                          '100_test_runs',
                          video_callable=lambda count: count % 100 == 0,
                          force=True)
        else:
            env = gym.make(args.env_name)

    env.seed(args.seed + worker_id)

    normalizer = get_normalizer(args.normalizer, env)

    # get an instance of the current global model state
    model = copy.deepcopy(global_model)
    model.eval()

    model_critic = None
    if global_model_critic:
        model_critic = copy.deepcopy(global_model_critic)
        model_critic.eval()

    state = torch.from_numpy(env.reset())

    writer = SummaryWriter(comment='_test', log_dir='experiments/runs/')
    start_time = time.time()

    t = 0
    episode_reward = 0

    done = False
    global_iter = 0
    best_global_reward = -np.inf
    best_test_reward = -np.inf

    while True:

        # Get params from shared global model
        model.load_state_dict(global_model.state_dict())
        if not args.shared_model:
            model_critic.load_state_dict(global_model_critic.state_dict())

        rewards = []
        eps_len = []

        sleep = True

        # make args.test_runs runs to get the current avg performance
        for i in range(args.test_runs):
            while not done:
                t += 1

                if not args.no_render:
                    if i == 0 and "RR" not in args.env_name:
                        env.render()
                        if args.monitor and sleep:  # add a small delay to do a screen capture of the test run if needed
                            time.sleep(1)
                            sleep = False

                # apply min/max scaling to the observation before feeding it to the model

                with torch.no_grad():

                    # select mean of normal dist as action --> Expectation
                    if args.shared_model:
                        _, mu, _ = model(normalizer(state))
                    else:
                        mu, _ = model(normalizer(state))

                    action = mu.detach()

                state, reward, done, _ = env.step(
                    np.clip(action.numpy(), -args.max_action, args.max_action))

                done = done or t >= args.max_episode_length
                episode_reward += reward

                if done:
                    # reset the accumulated reward and step counter, and reset the env
                    rewards.append(episode_reward)
                    episode_reward = 0

                    eps_len.append(t)
                    t = 0

                    state = env.reset()

                state = torch.from_numpy(state)

            # necessary to make more than one run
            done = False

        time_print = time.strftime("%Hh %Mm %Ss",
                                   time.gmtime(time.time() - start_time))

        std_reward = np.std(rewards)
        mean_reward = np.mean(rewards)

        new_best = mean_reward > best_test_reward
        writer.add_scalar("reward/test", mean_reward, int(T.value))
        writer.add_scalar("episode/length", np.mean(eps_len), int(T.value))

        log_string = f"Time: {time_print}, T={T.value} -- n_runs={args.test_runs} -- mean total reward={mean_reward:.5f}" \
                     f" +/- {std_reward:.5f} -- mean episode length={np.mean(eps_len):.5f}" \
                     f" +/- {np.std(eps_len):.5f} -- global reward={global_reward.value:.5f}"

        if new_best:
            # highlight messages if progress was made
            logging.info(log_string)

            best_global_reward = max(best_global_reward, global_reward.value)
            best_test_reward = max(best_test_reward, mean_reward)
            model_type = 'shared' if args.shared_model else 'split'
            model_type = 'shared' if args.shared_model else 'split'

            save_checkpoint(
                {
                    'epoch': T.value,
                    'model': model.state_dict(),
                    'model_critic': model_critic.state_dict() if model_critic is not None else None,
                    'global_reward': global_reward.value,
                    # only save optimizers if shared ones are used
                    'optimizer': optimizer.state_dict() if optimizer else None,
                    'optimizer_critic': optimizer_critic.state_dict() if optimizer_critic else None,
                },
                path=f"./experiments/checkpoints/model_{model_type}_T-{T.value}"
                     f"_global-{global_reward.value:.5f}_test-{mean_reward:.5f}.pth.tar"
            )
        else:
            # by default, only log a debug message if no progress was made
            logging.debug(log_string)

        global_iter += 1

        # run evaluation only once in test mode
        if args.test:
            break
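A minimal sketch of how this test worker might be launched, assuming the usual torch.multiprocessing A3C setup; parse_args and ActorNetwork are hypothetical placeholders, not names from the source:

import torch.multiprocessing as mp
from multiprocessing import Value

if __name__ == "__main__":
    args = parse_args()              # hypothetical CLI parser producing the fields test() reads
    global_model = ActorNetwork()    # hypothetical shared (actor) model
    global_model.share_memory()      # share parameters across worker processes

    T = Value('i', 0)                # global step counter
    global_reward = Value('d', 0.0)  # global running reward

    p = mp.Process(target=test, args=(args, 0, global_model, T, global_reward))
    p.start()
    p.join()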