Example #1
  def play_poison(self, n_step=10000, n_episode=1000, test_ep=None, render=False):
    print('play poison: ', self.poison)
    print('is_train: ', self.is_train)
    print('+++++++++++++++++++++++++++++++++==')

    if test_ep is None:
      test_ep = self.ep_end

    test_history = History(self.config)

    
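    # Record evaluation episodes with a Gym Monitor when not rendering to the screen.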
    if not self.display:
      gym_dir = '/tmp/%s-%s' % (self.env_name, get_time())
      #self.env.env.monitor.start(gym_dir)
      monitor = Monitor(self.env.env, directory=gym_dir)

    best_reward, best_idx = 0, 0
    total_reward = 0.

    for idx in range(n_episode):
      screen, reward, action, terminal = self.env.new_random_game()
      current_reward = 0

      for _ in range(self.history_length):
        test_history.add(screen)

      for t in tqdm(range(n_step), ncols=70):
        # 1. predict
        action = self.predict(test_history.get(), test_ep)
        # 2. act
        screen, reward, terminal = self.env.act(action, is_training=False)
        # 3. observe
        test_history.add(screen)

        # print('step: ', t, ' action: ', action, ' reward: ', reward)

        current_reward += reward
        if terminal:
          break

      if current_reward > best_reward:
        best_reward = current_reward
        best_idx = idx

      total_reward += current_reward

      print("="*30)
      print(" [%d] Best reward : %d" % (best_idx, best_reward))
      print("="*30)

    print('average reward is: ', total_reward/n_episode)
    if not self.display:
      monitor.close()
Example #2
def do_run(run, dirname, args):
    """
    global snapshot
    snapshot2 = tracemalloc.take_snapshot()
    print(('MEMORY', run, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    if snapshot is not None:
        top_stats = snapshot2.compare_to(snapshot, 'lineno')
        print("[ Top 10 differences ]")
        for stat in top_stats[:10]:
            print(stat)
        print()
    snapshot = snapshot2
    """
    with tf.Graph().as_default():
        learner_assumptions = get_learner_assumption_kwargs(args)

        # Each run has a different random seed equal to the run id.
        np.random.seed(run)
        random.seed(run)

        is_gridworld = 'lunar' not in args.env_name.lower()

        # TODO: Reset test goal inside here? Or use environment instead?
        rollouts = [[]]
        # Initialize model with wrong transition model based on aristotle learner.
        rollouts[0] += make_rollouts(
            #policy=aristotle_pilot_policies[0],  # Was from a noisy policy.
            policy=policies.make_perfect_pilot_policy(
                goal=test_goal,
                act_labels=train_act_labels,
            ),
            env=test_env,
            n=args.n_initial_rollouts,
            task_idx=task_idx,
        )
        assert len(rollouts[0]) == args.n_initial_rollouts
        rollouts[0] += make_rollouts(
            #policy=aristotle_pilot_policies[0],  # Was from a noisy policy.
            policy=policies.make_perfect_pilot_policy(
                goal=test_goal,
                act_labels=train_act_labels,
            ),
            env=wrong_train_env,
            n=args.n_initial_wrong_rollouts,
            task_idx=task_idx,
        )

        model = None
        Q = None
        start_pos = None

        logs = []
        evals = []
        evals_unassisted = []
        learner_q_values = []
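        # Everything below shares one TensorFlow session: the inverse soft-Q model, the support env, and the learner policy.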
        with tf.Session() as sess:
            if needs_model:
                model = inverse_softq.InverseSoftQModel(
                    train_envs=[test_env]
                )

            # NOTE: Used to be inside episode loop!
            # TODO: Check if this broke anything!
            support_env = get_support_env(
                s=args.learner_support,
                model=model,
                sess=sess,
                goal=test_goal,
                test_act_labels=test_act_labels,
                n_act_dim=n_act_dim,
                threshold=args.bumper_threshold,
                q_bumper_boltzmann=args.q_bumper_boltzmann,
                q_bumper_version=args.q_bumper_version,
                q_bumper_target_r=args.q_bumper_target_r,
                q_bumper_length_normalized=args.q_bumper_length_normalized,
                q_bumper_logistic_upper_prob=args.q_bumper_logistic_upper_prob,
                q_bumper_alpha=args.q_bumper_alpha,
                q_threshold=args.q_threshold,
                test_env=test_env,
                env_name=args.env_name,
                start_pos=start_pos,
                trajectory_distance=args.trajectory_distance,
                dirname=dirname,
                p_override=args.p_override,
                undoing=args.undoing,
                p_suboptimal_override=args.p_suboptimal_override,
                override_next_best=args.override_next_best,
                optimal_agent_training_timesteps=args.optimal_agent_training_timesteps,
                optimal_agent_smoothing_timesteps=args.optimal_agent_smoothing_timesteps,
                gamma=args.gamma,
            )
            policy = get_learner_policy(
                s=args.learner_policy,
                #model=model,
                #sess=sess,
                #test_goal=test_goal,
                #train_act_labels=train_act_labels,
                #test_act_labels=test_act_labels,
                #n_act_dim=n_act_dim,
                #Q=Q,
                env=support_env,
                exploration_fraction=args.exploration_fraction,
                exploration_final_eps=args.exploration_final_eps,
                exploration_final_lr=args.exploration_final_lr,
                total_episodes=args.n_episodes,
                run=run,
            )


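            # One Monitor directory per (run, episode): assisted episodes are recorded under dirname/assisted/<run>/<ep>.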
            for ep in range(args.n_episodes):
                #print('Rn: {} Ep: {}'.format(run, ep), flush=True)
                support_env_with_monitor = Monitor(
                    support_env,
                    directory=os.path.join(
                        dirname,
                        'assisted',
                        str(run).zfill(3),
                        str(ep).zfill(3),
                    ),
                    force=True,
                    video_callable=lambda e: is_gridworld or utils.IS_LOCAL,
                    #video_callable=(lambda e: True) if is_gridworld else None,
                )
                # Simulate human learning
                """
                if args.learner_policy == 'q':
                    assert(args.n_learn_rollouts > 0)
                    Q = policies.q_learning(
                        rollouts if ep == 0 else [rollouts[0][-args.n_learn_rollouts:]],
                        n_obs_dim=n_obs_dim,
                        n_act_dim=n_act_dim,
                        user_action=args.think_all_actions_own,
                        Q_init=Q,
                        learning_rate=args.q_learning_rate,
                    )
                """

                _logs = None
                if needs_model:
                    _logs = inverse_softq.run_learning(
                        model=model,
                        sess=sess,
                        # train_tasks=train_aristotle_envs[:1],
                        rollouts=rollouts,
                        test_goal=test_goal,
                        test_act_labels=test_act_labels,
                        train_act_labels=train_act_labels,
                        n_iters=args.n_softq_train_iters,
                        train_frac=0.9,  # TODO: Change to 1
                        **learner_assumptions
                    )

                # Test: evaluate the learner with assistance on the monitored support env
                #episode_seed = [run, ep]

                perf = compute_assisted_perf(
                    model=model,
                    sess=sess,
                    #test_act_labels=test_act_labels,
                    #train_act_labels=train_act_labels,
                    test_env=support_env_with_monitor,
                    policy=policy,
                    goal=test_goal,
                    #seed=episode_seed,
                    n_eval_rollouts=args.n_eval_rollouts,
                    policy_explore=True,
                    policy_update=True,
                    **learner_assumptions
                )

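                # Optionally evaluate the same policy without assistance (no exploration, no policy updates), recorded under dirname/unassisted/.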
                unassisted_perf = None
                if args.n_eval_unassisted_rollouts is not None:
                    unassisted_support_env = get_support_env(
                        s='unassisted',
                        goal=test_goal,
                        test_act_labels=test_act_labels,
                        n_act_dim=n_act_dim,
                        test_env=test_env,
                        env_name=args.env_name,
                        start_pos=start_pos,
                        trajectory_distance=args.trajectory_distance,
                        dirname=dirname,
                    )
                    unassisted_support_env_with_monitor = Monitor(
                        unassisted_support_env,
                        directory=os.path.join(
                            dirname,
                            'unassisted',
                            str(run).zfill(3),
                            str(ep).zfill(3),
                        ),
                        force=True,
                        video_callable=lambda e: is_gridworld or utils.IS_LOCAL,
                        #video_callable=(lambda e: True) if is_gridworld else None,
                    )
                    unassisted_perf = compute_assisted_perf(
                        model=model,
                        sess=sess,
                        #test_act_labels=test_act_labels,
                        #train_act_labels=train_act_labels,
                        test_env=unassisted_support_env_with_monitor,
                        policy=policy,
                        goal=test_goal,
                        #seed=episode_seed,
                        n_eval_rollouts=args.n_eval_unassisted_rollouts,
                        policy_explore=False,
                        policy_update=False,
                    )
                    unassisted_support_env_with_monitor.close()
                    unassisted_support_env.close()

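                # Keep the first n_learn_rollouts assisted rollouts as training data for the next episode.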
                new_rollouts = perf['rollouts']
                rollouts[task_idx] += new_rollouts[:args.n_learn_rollouts]
                if _logs is not None:
                    logs.append(_logs)
                evals.append(perf)
                evals_unassisted.append(unassisted_perf)
                if args.learner_policy == 'q':
                    learner_q_values.append(copy(policy.Q))

                support_env_with_monitor.close()

        support_env.close()
        policy.close()

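        # Persist per-run results (soft-Q logs, assisted/unassisted evals, learner Q-values, args) to JSON.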
        out_d = {
                'logs': logs,
                'evals': evals,
                'evals_unassisted': (
                    evals_unassisted
                    if args.n_eval_unassisted_rollouts is not None
                    else None
                ),
                'q_values': learner_q_values,
                'args': vars(args),
                'run': run,
                'support_details': support_env.get_support_details(),
        }
        with open(
                os.path.join(dirname, 'data{}.json'.format(str(run).zfill(3))),
                'w',
        ) as f:
            json.dump(out_d, f, cls=NumpyEncoder)
Example #3
        # (tail of the key_release handler for manual control; `a` is the
        #  [steer, gas, brake] action array shared with key_press)
        if k == key.DOWN: a[2] = 0

    env = CarRacingV1()
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
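    # Set record_video = True to wrap the env in a Monitor and save video to /tmp/video-test.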
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, '/tmp/video-test', force=True)
    isopen = True
    while isopen:
        env.reset()
        total_reward = 0.0
        steps = 0
        restart = False
        while True:
            s, r, done, info = env.step(a)
            total_reward += r
            if steps % 200 == 0 or done:
                print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                print("step {} total_reward {:+0.2f}".format(
                    steps, total_reward))
                #import matplotlib.pyplot as plt
                #plt.imshow(s)
                #plt.savefig("test.jpeg")
            steps += 1
            isopen = env.render()
            if done or restart or not isopen: break
    env.close()
Example #4
def main():
    toRender = {
        "line": 1,
        "circle": 1,
        "parabola": 0,
        "cycloid": 1,
        "random": 1,
        "rl": 0
    }

    if len(sys.argv) == 2:
        #read actions from file
        global env4list
        #toRender["rl"] = 1
        #fin = open(sys.argv[1],"r")
        #line = fin.readline()
        env4list = np.load(sys.argv[1])
        env4list = smooth(env4list)
        toRender["rl"] = 1

        #fin.close()

    global gViewer
    gViewer = rendering.Viewer(600, 600)
    saveVideo = True

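    # The "random"-policy environment (env0) is the one wrapped in a Monitor when saveVideo is set.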
    global env0, env0theta, env0done
    if toRender["random"]:
        env0 = bc.BrachistochroneEnv("random", gViewer, (0, 0, 0))
        if saveVideo:
            from gym.wrappers.monitor import Monitor
            env0 = Monitor(env0, './video-test', force=True)

        env0.reset()
        env0theta = 0
        env0done = False
        env0.score_label.x = gViewer.width - 150
        env0.score_label.y = gViewer.height - 10
    if toRender["line"]:
        global env1, env1theta, env1done
        env1 = bc.BrachistochroneEnv("line", gViewer, (1, 0, 0))
        if toRender["random"]:
            env1.setStartPosition(env0.start_position)
        env1done = False
        env1theta = math.atan(
            (env1.goal_position[1] - env1.start_position[1]) /
            (env1.goal_position[0] - env1.start_position[0])) / (math.pi)
        env1.reset()
        env1.score_label.x = gViewer.width - 150
        env1.score_label.y = gViewer.height - 25

    if toRender["circle"]:
        global env2, env2theta, env2done
        env2 = bc.BrachistochroneEnv("circle", gViewer, (0, 0, 1))
        if toRender["random"]:
            env2.setStartPosition(env0.start_position)
        env2done = False
        env2theta = 2 * math.atan(
            (env2.goal_position[1] - env2.start_position[1]) /
            (env2.goal_position[0] - env2.start_position[0])) / (math.pi)
        env2.reset()
        env2.score_label.x = gViewer.width - 150
        env2.score_label.y = gViewer.height - 40

    if toRender["cycloid"]:
        global env3, env3theta, env3done, R_cycloid, T_Cycloid
        env3 = bc.BrachistochroneEnv("cycloid", gViewer, (0, 0.75, 0.25))
        if toRender["random"]:
            env3.setStartPosition(env0.start_position)
        R_cycloid, T_Cycloid = solveCycloidInit(env3.start_position,
                                                env3.goal_position)
        env3theta = 2 * math.atan(
            (env3.goal_position[1] - env3.start_position[1]) /
            (env3.goal_position[0] - env3.start_position[0])) / (math.pi)
        env3done = False
        env3.reset()
        env3.score_label.x = gViewer.width - 150
        env3.score_label.y = gViewer.height - 55
    if toRender["rl"]:
        global env4, env4theta, env4done
        env4 = bc.BrachistochroneEnv("RL Agent", gViewer, (1, 0.5, 0))
        env4.reset()
        env4theta = 0
        env4done = False
        env4.score_label.x = gViewer.width - 150
        env4.score_label.y = gViewer.height - 70

    numsteps = 1000
    for i in range(numsteps):

        toRender["random"] and env0.render()
        toRender["line"] and env1.render()
        toRender["circle"] and env2.render()
        toRender["cycloid"] and env3.render()
        toRender["rl"] and env4.render()

        if toRender["random"] and not env0done:
            env0theta = env0.action_space.sample()
            _, _, env0done, _ = env0.step(np.float32(env0theta))
        if toRender["line"] and not env1done:
            _, _, env1done, _ = env1.step(np.float32([env1theta]))
        if toRender["circle"] and not env2done:
            _, _, env2done, _ = env2.step(np.float32([env2theta]))
            env2theta = 2 * math.atan(
                (env2.goal_position[1] - env2.state[1]) /
                (env2.goal_position[0] - env2.state[0])) / math.pi
        if toRender["cycloid"] and not env3done:
            _, _, env3done, _ = env3.step(np.float32([env3theta]))
            env3theta = solveCycloid(env3.start_position,
                                     [env3.state[0], env3.state[1]])
        """
        if toRender["rl"] and not env5done:
            line = fin.readline()
            if line:
                env0theta = [float(line)]
                _,_,env0done,_ = env5.step(np.float32([env5theta]))
            else:
                env0done = True
        """
        if toRender["rl"] and not env4done:
            if i >= len(env4list):
                continue
            env4theta = env4list[i]
            _, _, env4done, _ = env4.step(np.float32([env4theta]))

    toRender["random"] and env0.close()
    toRender["line"] and env1.close()
    toRender["circle"] and env2.close()
    toRender["cycloid"] and env3.close()
    if toRender["rl"]:
        pts = env4.path
        print(pts)
        coeffs = polyfit(pts)
        env4.close()
    return
Example #5
    def __call__(self, step_limit, solution=None, stamp=None, record=False):
        logger.info("Playing game %s with step_limit %d", self.game, step_limit)

        with torch.no_grad():
            controller = Controller(self.game, self.models_dir)
            if solution is not None:
                controller.load_solution(solution)
            else:
                controller.load_state(stamp)

            vae = VAE(self.game, self.models_dir)
            vae.load_state()

            mdn_rnn = MDN_RNN(self.game, self.models_dir)
            mdn_rnn.load_state()

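            # Build the Gym environment, apply the game-specific wrapper, and optionally record the run with a Monitor.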
            env = gym.make(self.game.key)
            if self.game.wrapper is not None:
                env = self.game.wrapper(env)
            if record:
                env = Monitor(env, "monitor", force=True)

            action = torch.zeros(self.game.action_vector_size)

            screen = env.reset()
            screen = transform(screen)
            screen.unsqueeze_(0)

            z, _, _ = vae.encoder(screen)
            _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))

            # h = torch.tensor([[[0] * 256]], dtype=torch.float32)

            overall_reward = 0
            steps = 0

            while True:
                env.render()
                action = controller(z.squeeze(0).squeeze(0), h.squeeze(0).squeeze(0))

                actual_action = self.game.transform_action(action.detach().numpy())
                screen, reward, done, _ = env.step(actual_action)

                overall_reward += reward
                screen = transform(screen)
                screen.unsqueeze_(0)

                z, _, _ = vae.encoder(screen)
                _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))

                if done or (step_limit and steps >= step_limit):
                    if done:
                        logger.info("Game reached done")
                    else:
                        logger.info("Step limit reached")

                    break

                steps += 1
            env.close()

            # Transform reward to be useful to CMA-ES
            overall_reward = self.game.transform_overall_reward(overall_reward)

            logger.info("Game %s finished with reward %d", self.game.key, overall_reward)

        return overall_reward
Example #6
from Controller import Controller
from car_racing import CarRacing
from gym.wrappers.monitor import Monitor

C = Controller()

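# Roll out each saved controller in a Monitor-wrapped CarRacing env and report the total reward.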
for weights in [BEST_CONTROLLER_WEIGHTS, OPTIMAL_CONTROLLER_WEIGHTS]:
    ENV = Monitor(CarRacing(), f'{weights[:-5]}_SIM', force=True)

    try:
        C.load_parameters(weights)
    except Exception as exc:
        raise Exception('Train the Controller first.') from exc

    done = False
    steps = 0

    observation = ENV.reset()
    reward_FULL = 0

    while not done and steps < MAX_STEPS:
        ENV.render()

        action = C.get_action(observation)
        observation, reward, done, _ = ENV.step(action)

        reward_FULL += reward
        steps += 1

    ENV.close()
    print(f'{weights} Reward: {reward_FULL}')
Example #7
    '''
    Defining the simulation related constants
    '''
    NUM_EPISODES = 50000
    MAX_T = np.prod(MAZE_SIZE, dtype=int) * 100
    STREAK_TO_END = 100
    SOLVED_T = np.prod(MAZE_SIZE, dtype=int)
    DEBUG_MODE = 0
    RENDER_MAZE = True
    ENABLE_RECORDING = True

    '''
    Creating a Q-Table for each state-action pair
    '''
    q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)

    '''
    Begin simulation
    '''
    recording_folder = "/tmp/maze_q_learning"

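    # Wrap the maze environment in a Monitor so the Q-learning run is recorded to recording_folder.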
    if ENABLE_RECORDING:
        monitor = Monitor(env, recording_folder, force=True)
        # env.monitor.start(recording_folder, force=True)

    simulate()

    if ENABLE_RECORDING:
        monitor.close()
        # env.monitor.close()