Example no. 1
def _test_mux_rnn_policies(policy_dict, env_list, n_envs):
    env = hex_env.Hexapod(env_list, max_n_envs=n_envs)
    env.env_change_prob = 1
    classifier = T.load(os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "data/classifier.p"),
                        map_location='cpu')

    # Test visually
    while True:
        current_idx = 0
        s = env.reset()
        h_p = None
        h_c = None
        episode_reward = 0
        with T.no_grad():
            for i in range(env.max_steps * 2):
                # Classify the current terrain from the observation stream,
                # then route the observation to the matching per-terrain policy.
                env_idx, h_c = classifier(
                    (my_utils.to_tensor(s, True).unsqueeze(0), h_c))
                env_idx = T.argmax(env_idx[0][0]).numpy()
                if env_idx != current_idx:
                    current_idx = env_idx
                    h_p = None
                    print("Changing policy to: {}".format(env_list[env_idx]))

                act, h_p = policy_dict[env_list[env_idx]](
                    (my_utils.to_tensor(s, True).unsqueeze(0), h_p))
                s, r, done, _ = env.step(act[0][0].numpy())
                episode_reward += r
                env.render()
                print("Env classification: {}".format(env_list[env_idx]))
        print("Episode reward: {}".format(episode_reward))
Example no. 2
def test_classifier_reactive_policies(policy_dict, env_list):
    env = hex_env.Hexapod(env_list)
    env.env_change_prob = 1
    env.max_steps = 600
    classifier = T.load("classifier_A.p", map_location='cpu')

    # Test visually
    while True:
        s = env.reset()
        h_c = None
        episode_reward = 0
        with T.no_grad():
            for i in range(env.max_steps * 2):
                env_idx, h_c = classifier(
                    (my_utils.to_tensor(s, True).unsqueeze(0), h_c))
                #print(env_idx)
                env_idx = T.argmax(env_idx[0][0]).numpy()

                act = policy_dict[env_list[env_idx]](my_utils.to_tensor(
                    s, True))
                s, r, done, _ = env.step(act[0].numpy())
                episode_reward += r
                env.render()
                print("Env classification: {}".format(env_list[env_idx]))
        print("Episode reward: {}".format(episode_reward))
Example no. 3
def test(env_list):
    env = hex_env.Hexapod(env_list)
    master = T.load("master_A.p", map_location='cpu')
    classifier = T.load("classifier_A.p", map_location='cpu')

    env.env_change_prob = 1.

    # Test visually
    while True:
        s = env.reset()
        h_m = None
        h_c = None
        episode_reward = 0
        with T.no_grad():
            for i in range(env.max_steps * 2):
                act, h_m = master((my_utils.to_tensor(s,
                                                      True).unsqueeze(0), h_m))
                c, h_c = classifier(
                    (my_utils.to_tensor(s, True).unsqueeze(0), h_c))
                s, r, done, _ = env.step(act[0][0].numpy())
                episode_reward += r
                env.render()
                print("Env classification: {}".format(env_list[T.argmax(
                    c[0][0]).numpy()]))
        print("Episode reward: {}".format(episode_reward))
Example no. 4
def _test_mux_reactive_policies(policy_dict, env_list, n_envs, ID='def'):
    import cv2

    def printval(values):
        # Draw the classifier's softmax output in a small OpenCV window;
        # the argmax class is rendered in green, the other two in cyan.
        img = np.zeros((90, 200, 3), dtype=np.uint8)
        a_idx = np.argmax(values)
        cv2.putText(img,
                    'p_{}'.format(env_list[0]) + '{0:.2f}'.format(values[0]),
                    (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (255 * int(a_idx != 0), 255, 0), 1, cv2.LINE_AA)
        cv2.putText(img,
                    'p_{}'.format(env_list[1]) + '{0:.2f}'.format(values[1]),
                    (10, 45), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (255 * int(a_idx != 1), 255, 0), 1, cv2.LINE_AA)
        cv2.putText(img,
                    'p_{}'.format(env_list[2]) + '{0:.2f}'.format(values[2]),
                    (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (255 * int(a_idx != 2), 255, 0), 1, cv2.LINE_AA)
        cv2.imshow('classification', img)
        cv2.waitKey(1)

    env = hex_env.Hexapod(env_list,
                          max_n_envs=3,
                          specific_env_len=25,
                          s_len=200,
                          walls=False)
    env.env_change_prob = 1
    classifier = T.load(os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "data/classifier_{}.p".format(ID)),
                        map_location='cpu')

    # Test visually
    while True:
        s = env.reset()
        h_c = None
        episode_reward = 0
        with T.no_grad():
            for i in range(env.max_steps + 400):
                env_dist, h_c = classifier(
                    (my_utils.to_tensor(s, True).unsqueeze(0), h_c))
                env_softmax = T.softmax(env_dist, 2)[0][0].numpy()
                env_idx = T.argmax(env_dist[0][0]).numpy()
                printval(env_softmax)

                act = policy_dict[env_list[env_idx]](my_utils.to_tensor(
                    s, True))

                s, r, done, _ = env.step(act[0].numpy())
                episode_reward += r
                env.render()
                #print("Env classification: {}".format(env_list[env_idx]))
        print("Episode reward: {}".format(episode_reward))
Example no. 5
    def test(self, policy, render=True, N=30, seed=None):

        if seed is not None:
            self.setseed(seed)
        self.env_change_prob = 1
        rew = 0
        vel_rew = 0
        dist_rew = 0
        for i in range(N):
            obs = self.reset()
            cr = 0
            vr = 0  # velocity reward: not accumulated in this variant
            dr = 0  # distance reward: not accumulated in this variant
            for j in range(int(self.max_steps)):
                #obs[0:18] = obs[0:18] + np.random.randn(18) * 0.3
                action = policy(my_utils.to_tensor(obs, True)).detach()
                obs, r, done, _ = self.step(action[0].numpy(), render=render)
                cr += r

            rew += cr
            vel_rew += vr
            dist_rew += dr
            if render:
                print("Total episode reward: {}".format(cr))
        if render:
            print("Total average reward = {}".format(rew / N))
        return rew / N, vel_rew / N, dist_rew / N
Example no. 6
    def f(w):
        rewards = []
        done = False
        obs, _ = env.reset()

        # Write the flat parameter vector w into the policy network's weights
        vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

        while not done:

            # Get action from policy
            with torch.no_grad():
                act = policy(my_utils.to_tensor(obs, True))

            # Step environment
            obs, rew, done, od = env.step(act.squeeze(0).numpy())

            if animate:
                env.render()

            rewards.append(od['rV'])

        # For each step, add the sum of the reward components minus their mean
        # absolute deviation (a spread penalty); the total is negated so the
        # caller can minimize f.
        r = 0
        for rew in rewards:
            rew_arr = np.array(rew)
            r += rew_arr.sum() - np.abs(rew_arr - rew_arr.mean()).mean()

        return -r
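f(w) maps a flat parameter vector to a negative episode score, which is the objective shape an evolution-strategy or random-search optimizer expects. Below is a minimal driver sketch under that assumption; the search strategy shown (Gaussian hill climbing) and the availability of policy and f in scope are assumptions, since the original code does not show which optimizer consumes f.

# Hypothetical driver loop for f(w); the search strategy is an assumption.
import numpy as np
from torch.nn.utils import parameters_to_vector

w_best = parameters_to_vector(policy.parameters()).detach().numpy()
f_best = f(w_best)
for _ in range(100):
    w_cand = w_best + np.random.randn(*w_best.shape) * 0.05  # perturb the incumbent
    f_cand = f(w_cand)
    if f_cand < f_best:  # f returns negative reward, so lower is better
        w_best, f_best = w_cand, f_cand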
Example no. 7
    def f(w):
        reward_total = 0
        reps = 1
        vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

        for i in range(reps):
            reward = 0
            done = False
            obs = env.reset()

            h_0 = policy.init_hidden()
            while not done:

                # Get action from policy
                with torch.no_grad():
                    act, h_1 = policy((my_utils.to_tensor(obs, True), h_0))

                # Step environment
                act = act.squeeze(0).numpy()
                #act = np.array([-1,0])
                obs, rew, done, _ = env.step(act)

                if animate:
                    env.render()

                reward += rew

                h_0 = h_1

            reward_total += reward

        return -reward_total / reps
    def test_record_hidden(self, policy):
        self.reset()
        h_episodes = []
        for i in range(10):
            h_list = []
            obs = self.reset()
            h = None
            cr = 0
            for j in range(self.max_steps * 2):
                action, h = policy((my_utils.to_tensor(obs, True), h))
                obs, r, done, od, = self.step(action[0].detach().numpy())
                cr += r
                time.sleep(0.001)
                self.render()
                h_list.append(h[0].detach().numpy())
            print("Total episode reward: {}".format(cr))
            h_arr = np.concatenate(h_list)
            h_episodes.append(h_arr)

        h_episodes_arr = np.stack(h_episodes)

        # Save hidden states
        filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                "data/{}_states.npy".format(self.env_name))
        np.save(filename, h_episodes_arr)
Example no. 9
 def test(self, policy, render=True, N=30, seed=None):
     if seed is not None:
         self.setseed(seed)
     self.env_change_prob = 1
     rew = 0
     vel_rew = 0
     dist_rew = 0
     for i in range(N):
         obs = self.reset()
         cr = 0
         vr = 0
         dr = 0
         for j in range(int(self.max_steps)):
             action = policy(my_utils.to_tensor(obs, True)).detach()
             obs, r, done, (r_v, r_d) = self.step(action[0].numpy())
             cr += r
             vr += r_v
             dr = max(dr, r_d)
             time.sleep(0.000)
             if render:
                 self.render()
         rew += cr
         vel_rew += vr
         dist_rew += dr
         if render:
             print("Total episode reward: {}".format(cr))
     if render:
         print("Total average reward = {}".format(rew / N))
     return rew / N, vel_rew / N, dist_rew / N
Example no. 10
def make_rollout(env, policy):
    obs = env.reset()
    observations = []
    clean_actions = []
    noisy_actions = []
    rewards = []
    step_ctr_list = []
    episode_rew = 0
    step_ctr = 0
    while True:
        step_ctr_list.append(step_ctr)
        observations.append(obs)

        clean_act, noisy_act = policy.sample_action(
            my_utils.to_tensor(obs, True))
        clean_act = clean_act.squeeze(0).detach().numpy()
        noisy_act = noisy_act.squeeze(0).detach().numpy()
        obs, r, done, _ = env.step(noisy_act)

        if abs(r) > 5:
            logging.warning("Warning! high reward ({})".format(r))

        step_ctr += 1
        episode_rew += r

        if config["animate"]:
            env.render()

        clean_actions.append(clean_act)
        noisy_actions.append(noisy_act)
        rewards.append(r)
        if done: break
    terminals = [False] * len(observations)
    terminals[-1] = True
    return observations, clean_actions, noisy_actions, rewards, terminals, step_ctr_list
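The lists returned by make_rollout are convenient to pack into arrays before training; a short sketch follows. The dictionary keys are assumptions for illustration and are not names used elsewhere in the code.

# Hypothetical post-processing of one rollout into numpy arrays; key names are assumed.
import numpy as np

def rollout_to_arrays(env, policy):
    observations, clean_actions, noisy_actions, rewards, terminals, step_ctr_list = \
        make_rollout(env, policy)
    return {
        "obs": np.stack(observations),         # (T, obs_dim)
        "act_clean": np.stack(clean_actions),  # (T, act_dim)
        "act_noisy": np.stack(noisy_actions),  # (T, act_dim)
        "rew": np.asarray(rewards, dtype=np.float32),
        "done": np.asarray(terminals, dtype=bool),
        "step": np.asarray(step_ctr_list),
    }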
    def test_record(self, policy, ID):
        episode_states = []
        episode_acts = []
        for i in range(10):
            s = self.reset()
            cr = 0

            states = []
            acts = []

            for j in range(self.max_steps):
                states.append(s)
                action = policy(my_utils.to_tensor(s,
                                                   True)).detach()[0].numpy()
                acts.append(action)
                s, r, done, od, = self.step(action)
                cr += r

            episode_states.append(np.concatenate(states))
            episode_acts.append(np.concatenate(acts))

            print("Total episode reward: {}".format(cr))

        np_states = np.concatenate(episode_states)
        np_acts = np.concatenate(episode_acts)

        np.save(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/{}_states.npy".format(ID)), np_states)
        np.save(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/{}_acts.npy".format(ID)), np_acts)
Example no. 12
    def test_recurrent(self, policy):
        self.env_change_prob = 1
        self.reset()
        h_episodes = []
        for i in range(10):
            self.difficulty = 1.5
            h_list = []
            obs = self.reset()
            h = None
            cr = 0
            for j in range(self.max_steps * 3):
                action, h = policy((my_utils.to_tensor(obs,
                                                       True).unsqueeze(0), h))
                obs, r, done, od, = self.step(action[0, 0].detach().numpy() +
                                              np.random.randn(self.act_dim) *
                                              0.1)
                cr += r
                time.sleep(0.001)
                self.render()
                h_list.append(h[0][:, 0, :].detach().numpy())
            print("Total episode reward: {}".format(cr))
            h_arr = np.stack(h_list)
            h_episodes.append(h_arr)

        h_episodes_arr = np.stack(h_episodes)

        # Save hidden states
        filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                "data/{}_states.npy".format(self.env_name))
        np.save(filename, h_episodes_arr)
    def test_recurrent(self, policy):
        self.env_change_prob = 1
        self.reset()
        h_episodes = []
        N = 20
        rew = 0
        for i in range(N):
            h_list = []
            obs = self.reset()
            h = None
            cr = 0
            for j in range(self.max_steps):
                action, h = policy((my_utils.to_tensor(obs,
                                                       True).unsqueeze(0), h))
                obs, r, done, od, = self.step(action[0].detach().numpy())
                cr += r
                rew += r
                time.sleep(0.001)
                self.render()
                #h_list.append(h[0][:,0,:].detach().numpy())
            print("Total episode reward: {}".format(cr))
            #h_arr = np.stack(h_list)
            #h_episodes.append(h_arr)

        print("Total average reward = {}".format(rew / N))
        exit()
Example no. 14
    def test_recurrent(self, policy, render=True, N=30, seed=None):
        if seed is not None:
            np.random.seed(seed)
        self.env_change_prob = 1

        rew = 0
        vel_rew = 0
        dist_rew = 0
        for i in range(N):
            obs = self.reset()
            h = None
            cr = 0
            vr = 0
            dr = 0
            for j in range(self.max_steps):
                action, h = policy((my_utils.to_tensor(obs, True).unsqueeze(0), h))
                obs, r, done, (r_v, r_d) = self.step(action[0].detach().numpy())
                cr += r
                vr += r_v
                dr = max(dr, r_d)

                time.sleep(0.000)
                if render:
                    self.render()

            rew += cr
            vel_rew += vr
            dist_rew += dr

            if render:
                print("Total episode reward: {}".format(cr))

        return rew / N, vel_rew / N, dist_rew / N
    def test_adapt(self, p1, p2, ID):
        self.env_list = ["flatpipe"]

        episode_states = []
        episode_acts = []
        ctr = 0
        while ctr < 1000:
            print("Iter: {}".format(ctr))
            current_policy_name = "p1"
            rnd_x = -0.1 + np.random.rand() * 0.3 + np.random.randint(0,
                                                                      2) * 1.2
            s = self.reset(init_pos=np.array([rnd_x, 0, 0]))
            cr = 0
            states = []
            acts = []

            policy = p1

            for j in range(self.max_steps):
                x = self.sim.get_state().qpos.tolist()[0]

                if 2.2 > x > 0.8 and current_policy_name == "p1":
                    policy = p2
                    current_policy_name = "p2"
                    print("Policy switched to p2")

                if not (2.2 > x > 0.8) and current_policy_name == "p2":
                    policy = p1
                    current_policy_name = "p1"
                    print("Policy switched to p1")

                states.append(s)
                action = policy(my_utils.to_tensor(s,
                                                   True)).detach()[0].numpy()
                acts.append(action)
                s, r, done, od, = self.step(action)
                cr += r

                #self.render()

            if cr < 50:
                continue
            ctr += 1

            episode_states.append(np.stack(states))
            episode_acts.append(np.stack(acts))

            print("Total episode reward: {}".format(cr))

        np_states = np.stack(episode_states)
        np_acts = np.stack(episode_acts)

        np.save(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/states_{}.npy".format(ID)), np_states)
        np.save(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/acts_{}.npy".format(ID)), np_acts)
Example no. 16
 def test(self, policy):
     for i in range(100):
         obs = self.reset()
         cr = 0
         for j in range(self.max_steps):
             action = policy(my_utils.to_tensor(obs, True)).detach()
             obs, r, done, od, = self.step(action[0].numpy())
             cr += r
             time.sleep(0.001)
             self.render()
         print("Total episode reward: {}".format(cr))
Example no. 17
 def test(self, policy):
     self.reset()
     for i in range(100):
         done = False
         obs, _ = self.reset()
         cr = 0
         while not done:
             action = policy(my_utils.to_tensor(obs, True)).detach()
             obs, r, done, od, = self.step(action[0])
             cr += r
             time.sleep(0.001)
         print("Total episode reward: {}".format(cr))
Example no. 18
 def test(self, policy):
     #self.envgen.load()
     for i in range(100):
         obs = self.reset(test=True)
         cr = 0
         for j in range(self.max_steps):
             action = policy(my_utils.to_tensor(obs, True)).detach()
             #print(action[0, :-self.mem_dim])
             obs, r, done, od, = self.step(action[0])
             cr += r
             time.sleep(0.001)
             self.render()
         print("Total episode reward: {}".format(cr))
Example no. 19
 def test_recurrent(self, policy):
     self.reset()
     for i in range(100):
         obs = self.reset()
         h = None
         cr = 0
         for j in range(self.max_steps):
             action, h = policy((my_utils.to_tensor(obs, True).unsqueeze(0), h))
             obs, r, done, od, = self.step(action[0, 0].detach().numpy())
             cr += r
             time.sleep(0.001)
             self.render()
         print("Total episode reward: {}".format(cr))
Example no. 20
    def test(self, policy):
        #self.envgen.load()
        self.env_change_prob = 1
        for i in range(100):
            obs = self.reset()
            done = False
            cr = 0
            while not done:
                action = policy(my_utils.to_tensor(obs, True)).detach()
                obs, r, done, od, = self.step(action[0].numpy())
                cr += r
                time.sleep(0.001)
                self.render()

            print("Total episode reward: {}".format(cr))
Example no. 21
def test_agent(env, policy):
    for _ in range(100):
        obs = env.reset()
        cum_rew = 0
        while True:
            action, noisy_action = policy.sample_action(
                my_utils.to_tensor(obs, True))
            obs, reward, done, info = env.step(
                action.detach().squeeze(0).numpy())
            cum_rew += reward
            env.render()
            if done:
                print(cum_rew)
                break
    env.close()
Example no. 22
 def test_recurrent(self, policy):
     self.reset()
     for i in range(100):
         done = False
         obs, _ = self.reset()
         h = policy.init_hidden()
         cr = 0
         while not done:
             action, h_ = policy((my_utils.to_tensor(obs, True), h))
             h = h_
             obs, r, done, od, = self.step(action[0].detach())
             cr += r
             time.sleep(0.001)
             self.render()
         print("Total episode reward: {}".format(cr))
Example no. 23
 def test(self, policy, render=True):
     N = 30
     rew = 0
     for i in range(N):
         obs = self.reset()
         cr = 0
         for j in range(int(self.max_steps)):
             action = policy(my_utils.to_tensor(obs, True)).detach()
             obs, r, done, od, = self.step(action[0].numpy())
             cr += r
             rew += r
             time.sleep(0.000)
             if render:
                 self.render()
         print("Total episode reward: {}".format(cr))
     print("Total average reward = {}".format(rew / N))
Example no. 24
 def test(self, policy):
     #self.envgen.load()
     self.env_change_prob = 1
     for i in range(100):
         obs = self.reset()
         cr = 0
         for j in range(int(self.max_steps * 1.5)):
             action = policy(my_utils.to_tensor(obs, True)).detach()
             obs, r, done, od, = self.step(action[0].numpy())
             cr += r
             time.sleep(0.001)
             self.render()
             if np.sqrt((self.prev_xy[0] - self.goal_xy[0])**2 +
                        (self.prev_xy[1] - self.goal_xy[1])**2) < 0.15:
                 break
         print("Total episode reward: {}".format(cr))
Example no. 25
 def test_recurrent(self, policy):
     total_rew = 0
     self.render_prob = 1.0
     for i in range(100):
         obs = self.reset()
         h = None
         cr = 0
         for j in range(self.max_steps):
             action, h_ = policy((my_utils.to_tensor(obs, True), h))
             h = h_
             obs, r, done, od, = self.step(action[0].detach().numpy())
             cr += r
             total_rew += r
             time.sleep(0.001)
             self.render()
         print("Total episode reward: {}".format(cr))
     print("Total reward: {}".format(total_rew))
Example no. 26
 def test_recurrent(self, policy, slow=True, seed=None):
     if seed is not None:
         np.random.seed(seed)
     total_rew = 0
     for i in range(100):
         obs = self.reset()
         h = None
         cr = 0
         for j in range(self.max_steps):
             action, h = policy((my_utils.to_tensor(obs, True).unsqueeze(0), h))
             obs, r, done, od, = self.step(action[0][0].detach().numpy())
             cr += r
             total_rew += r
             if slow:
                 time.sleep(0.01)
         print("Total episode reward: {}".format(cr))
     print("Total reward: {}".format(total_rew))
Example no. 27
 def test(self, policy, slow=True, seed=None):
     if seed is not None:
         np.random.seed(seed)
     self.render_prob = 1.0
     total_rew = 0
     for i in range(100):
         obs = self.reset()
         cr = 0
         for j in range(self.max_steps):
             action = policy(my_utils.to_tensor(obs, True)).detach()
             obs, r, done, od, = self.step(action[0].numpy())
             cr += r
             total_rew += r
             if slow:
                 time.sleep(0.01)
         print("Total episode reward: {}".format(cr))
     print("Total reward: {}".format(total_rew))
Example no. 28
    def test_agent(self, policy):
        import src.my_utils as my_utils
        for _ in range(100):
            obs = self.reset()
            cum_rew = 0
            ctr = 0
            while True:
                torso_pos_prev, torso_quat_prev, _, _, joint_angles_prev, _, _, _, _, _ = self.get_obs()
                action, _ = policy.sample_action(my_utils.to_tensor(obs, True))
                obs, reward, done, info = self.step(action.detach().squeeze(0).numpy())
                cum_rew += reward
                self.render()

                if ctr % 10 == 0 and ctr > 0:
                    p.setJointMotorControlArray(bodyUniqueId=self.robot,
                                                jointIndices=range(18),
                                                controlMode=p.POSITION_CONTROL,
                                                targetPositions=[0] * 18,
                                                forces=[0] * 18,
                                                physicsClientId=self.client_ID)
                    joint_angles_desired = self.norm_to_rads(np.tanh(action.detach().squeeze(0).numpy() * 0.5))
                    for _ in range(3):
                        [p.resetJointState(self.robot, k, joint_angles_prev[k], 0, physicsClientId=self.client_ID) for k
                         in range(18)]
                        p.stepSimulation(physicsClientId=self.client_ID)
                        time.sleep(0.6)

                        [p.resetJointState(self.robot, k, joint_angles_desired[k], 0, physicsClientId=self.client_ID)
                         for k in range(18)]
                        p.stepSimulation(physicsClientId=self.client_ID)
                        time.sleep(0.6)

                    [p.resetJointState(self.robot, k, joint_angles_prev[k], 0, physicsClientId=self.client_ID) for k in
                     range(18)]
                    p.stepSimulation(physicsClientId=self.client_ID)

                ctr += 1

                if done:
                    print(cum_rew)
                    break
        self.close()
Example no. 29
    def test_recurrent(self, policy):
        self.reset()
        for i in range(100):
            done = False
            obs = self.reset()
            h = policy.init_hidden()
            cr = 0
            self.max_steps = 600
            acts = []
            while not done:
                action, h_ = policy((my_utils.to_tensor(obs, True), h))
                acts.append(action[0].detach())
                h = h_
                obs, r, done, od, = self.step(action[0].detach())
                cr += r
                time.sleep(0.001)
                self.render()

            print("Total episode reward: {}".format(cr))
 def make_rollout(self, policy):
     self.env.set_randomize_env(False)
     obs = self.env.reset()
     observations = []
     clean_actions = []
     noisy_actions = []
     rewards = []
     while True:
         observations.append(obs)
         clean_act, noisy_act = policy.sample_action(
             my_utils.to_tensor(obs, True))
         clean_act = clean_act.squeeze(0).detach().numpy()
         noisy_act = noisy_act.squeeze(0).detach().numpy()
         obs, r, done, _ = self.env.step(noisy_act)
         clean_actions.append(clean_act)
         noisy_actions.append(noisy_act)
         rewards.append(r)
         if done: break
     terminals = [False] * len(observations)
     terminals[-1] = True
     return observations, clean_actions, noisy_actions, rewards, terminals