def loop(n):
    logger_her.info("***************************")
    logger_her.info("**** Bit flipping game ****")
    logger_her.info("***************************")
    logger_her.info("Start main loop with size {}".format(n))
    logger_her.info("HER STATUS: {}".format(HER))

    actor = QModel(n, HER)
    critic = QModel(n, HER)

    if not TRAIN_FROM_SCRATCH:
        actor.load()
        critic.load()
    else:
        logger_her.info("Training QNetworks from scratch")

    re_buffer = Buffer(BUFFER_SIZE)

    for epoch in range(EPOCHS):
        logger_her.info("Start epoch {}".format(epoch + 1))

        for episode_idx in range(EPISODES):
            goal = State.sample_status(n)
            start = State.sample_status(n)
            # Store the start and goal configurations in a single State object.
            state = State(start, goal)

            _, episode = sample_episode(actor, state, epsilon_greedy=True)
            re_buffer.add(episode)

            if HER:
                # Hindsight Experience Replay: relabel transitions with goals
                # taken from states actually visited later in the episode.
                new_experience = []
                for s, a, r, sn in episode:
                    for t in _sample(n, HER_NEW_GOALS):
                        _g = episode[t][-1].status
                        _sn = State(sn.status.copy(), _g.copy())
                        exp = (State(s.status.copy(), _g.copy()), a,
                               0 if _sn.is_final else -1, _sn)
                        new_experience.append(exp)
                re_buffer.add(new_experience)

        for training_step in range(TRAINING_STEPS):
            minibatch = re_buffer.sample(BATCH_SIZE)
            train(critic, actor, minibatch)

        if (epoch + 1) % UPDATE_ACTOR == 0:
            actor.update(critic)

        success_rate = evaluate_actor(actor)
        re_buffer.log_stats()

        if success_rate >= 1. - 1e-9:
            logger_her.info("Learned policy (QAction-Value) for {} bits in {} epochs".format(n, epoch + 1))
            break
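# Hedged usage sketch: loop() above is self-contained apart from the module-level
# constants it reads (EPOCHS, EPISODES, ...), so a driver only has to choose a
# problem size; 15 bits here is an arbitrary example.
if __name__ == "__main__":
    loop(15)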
def deserialize_json(self, o):
    # Return an (s, a, r, sprime, pfbm) tuple.
    s = State.deserialize_json(o["s"])
    sprime = State.deserialize_json(o["sprime"])
    a = o["a"]
    r = o["r"]
    pfbm = None if o["pfbm"] is None else np.asarray(o["pfbm"])
    return (s, a, r, sprime, pfbm)
class AoasUI(tk.Frame):
    def __init__(self, parent=None):
        tk.Frame.__init__(self, parent)
        self.grid(column=0, row=0)
        self.columnconfigure(0, weight=1)
        self.rowconfigure(0, weight=1)
        self.parent = parent

        self.test_problem = State()
        self.test_fringe = Queue()
        self.problem = State()
        self.solution = tree_search(self.test_problem, self.test_fringe)
        self.solution_stage = 0

        self.connor_hp = tk.IntVar()
        self.arnold_hp = tk.IntVar()
        self.connor_defense = tk.StringVar()
        self.arnold_defense = tk.StringVar()
        self.update()

        name_column = 1
        data_column = 2
        self.ui_label(1, name_column, 'Connor')
        self.ui_label(4, name_column, 'Arnold')
        self.ui_label(2, name_column, 'Defense')
        self.ui_label(5, name_column, 'Defense')
        self.ui_label(1, data_column, self.connor_hp, True)
        self.ui_label(4, data_column, self.arnold_hp, True)
        self.ui_label(2, data_column, self.connor_defense, True)
        self.ui_label(5, data_column, self.arnold_defense, True)
        self.ui_btn(6, data_column, 'Next', self.resolve)

    def resolve(self):
        # Apply the next action from the precomputed solution and refresh the UI.
        if self.solution_stage < len(self.solution):
            print(self.solution[self.solution_stage])
            self.problem.agent_action(self.solution[self.solution_stage])
            self.update()
            self.solution_stage += 1

    def update(self):
        # Note: this shadows tkinter's Frame.update(); it refreshes the bound variables.
        self.connor_hp.set(self.problem.connor.hp)
        self.arnold_hp.set(self.problem.terminator.hp)
        self.connor_defense.set(self.problem.connor.defense)
        self.arnold_defense.set(self.problem.terminator.defense)

    def ui_label(self, row, column, text, textvariable=False):
        if not textvariable:
            tk.Label(self, text=text).grid(column=column, row=row)
        else:
            tk.Label(self, textvariable=text).grid(column=column, row=row)

    def ui_btn(self, row, column, text, command):
        tk.Button(self, text=text, command=command).grid(column=column, row=row)
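# Hedged usage sketch: AoasUI is a tk.Frame, so a hypothetical entry point just
# mounts it in a root window and starts the Tk event loop.
root = tk.Tk()
app = AoasUI(parent=root)
root.mainloop()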
def mc_control(num_episodes=10000):
    q_sa = {}
    p = {}
    n_s = {}
    n_sa = {}
    n0 = 100

    for _ in range(num_episodes):
        state = State()
        reward = 0
        episode_s = []
        episode_sa = []

        while not state.terminal:
            s = state.as_tuple()
            if s in p:
                a = sample_action(p[s])
            else:
                a = Action.random()

            episode_s.append(s)
            episode_sa.append(s + (a, ))
            state, reward = step(state, a)

            ns = n_s.get(s, 0)
            n_s[s] = ns + 1
            sa = s + (a, )
            nsa = n_sa.get(sa, 0)
            n_sa[sa] = nsa + 1

        # GLIE MC Control
        for sa in set(episode_sa):
            nsa = n_sa[sa]
            qsa = q_sa.get(sa, 0)
            q_sa[sa] = qsa + ((reward - qsa) / nsa)

        # Improve policy
        for s in set(episode_s):
            a_best = greedy_action(q_sa, s)
            ns = n_s.get(s, 0)
            epsilon = n0 / (n0 + ns)
            selection_probs = []
            for a in list(Action):
                if a is a_best:
                    selection_probs.append(1 - epsilon + epsilon / len(Action))
                else:
                    selection_probs.append(epsilon / len(Action))
            p[s] = selection_probs

    return q_sa
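# Hedged usage sketch: mc_control() above returns a dict keyed by state tuples
# extended with an action; this just runs it and reports the table size.
q_sa = mc_control(num_episodes=50000)
print("{} state-action pairs estimated".format(len(q_sa)))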
def expand_Q(w):
    Q = np.zeros((10, 21, 2))
    for dealer in DEALER_RANGE:
        for player in PLAYER_RANGE:
            for action in ACTIONS:
                state = State()
                state.dealercard = dealer
                state.playersum = player
                feats = phi(state, action)
                Q[dealer - 1, player - 1][action] = np.sum(feats * w)
    return Q
def evaluate_actor(actor, episodes_count=TESTING_EPISODES, verbose=0, pause=0):
    success_counter = 0

    for episode_ev in range(episodes_count):
        start = State.sample_status(actor.n)
        goal = State.sample_status(actor.n)

        success, _ = sample_episode(actor, State(start, goal),
                                    epsilon_greedy=False, verbose=verbose)
        success_counter += int(success)

        if pause:
            input("Press <Enter> to continue...")

    logger_her.info("Success/Total {}/{}".format(success_counter, episodes_count))
    logger_her.info("Success rate: {}".format(success_counter / episodes_count))

    return success_counter / episodes_count
def get_value_function(self):
    for i in range(1, self.env.dealer_max_value + 1):
        for j in range(1, self.env.agent_max_value + 1):
            s = State(j, i)
            print(s.dealer_sum, s.agent_sum)
            self.V[i][j] = self.get_max_action(s)
    return self.V
def __init__(self, states, alpha: float = 0.15, random_factor: float = 0.2):
    self.state_history = [(State(0, 0), 0)]
    self.alpha = alpha
    self.random_factor = random_factor
    self.G = Agent.init_reward(states)
def _load_task(self, task_dict, states_dir):
    task = Task(resume_utg=False, **task_dict)

    for i in range(len(task_dict["state_history"])):
        state_str = task_dict["state_history"][i]
        action_str = task_dict["action_history"][i]
        state = State.load(state_dir=states_dir, state_str=state_str)
        state.setup(task)
        action = self._load_action(state, action_str)
        task.state_history.append(state)
        task.action_history.append(action)

    task.state = State.load(state_dir=states_dir, state_str=task_dict["state"])
    task.state.setup(task)
    task.reward = task_dict["reward"]
    task.total_reward = task_dict["total_reward"]
    task.done = task_dict["done"]
    return task
def Lfa():
    lmbd = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    learning_curves = {}
    state1 = State()
    num_episodes = 2000

    with open("./Q_dump_episodes_1000000.pkl", "rb") as f:
        opt_value = pickle.load(f)

    for item in lmbd:
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        Q_value, error_history = lfa_learn(item, opt_value, num_episodes)
        learning_curves[item] = error_history

    plot_file = "./outcome/lfa_error_{}_episodes_time_{}.pdf".format(num_episodes, time.time())
    plot_learning_curve(learning_curves, save=plot_file)
def sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, yield_progress=False):
    q_sa = {}
    n_s = {}
    n_sa = {}

    for n in range(num_episodes):
        e_sa = {}
        state = State()
        s = state.as_tuple()
        a = epsilon_greedy_action(q_sa, s, calculate_epsilon(n_s, s))

        while not state.terminal:
            state, reward = step(state, a)
            n_s[s] = n_s.get(s, 0) + 1

            s_next = state.as_tuple()
            a_next = epsilon_greedy_action(q_sa, s_next, calculate_epsilon(n_s, s_next))

            sa = s + (a, )
            sa_next = s_next + (a_next, )

            qsa = q_sa.get(sa, 0)
            qsa_next = q_sa.get(sa_next, 0)

            nsa = n_sa.get(sa, 0) + 1
            n_sa[sa] = nsa

            delta = reward + gamma * qsa_next - qsa
            e_sa[sa] = e_sa.get(sa, 0) + 1

            for (s, a) in generate_all_state_action_pairs():
                sa = s + (a, )
                q_sa[sa] = q_sa.get(sa, 0) + (delta * e_sa.get(sa, 0)) / nsa
                e_sa[sa] = gamma * lamba * e_sa.get(sa, 0)

            s = s_next
            a = a_next

        if yield_progress:
            yield n + 1, q_sa

    if not yield_progress:
        yield num_episodes, q_sa
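# Hedged usage sketch: sarsa_lambda() is written as a generator; with
# yield_progress=False it yields a single (num_episodes, q_sa) pair at the end.
_, q_sa = next(sarsa_lambda(num_episodes=1000, lamba=0.5))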
def Sarsa_lamda_Control(lmbd, opt_value, num_episodes):
    # initialize
    value = np.zeros((10, 21, 2))
    counter = np.zeros((10, 21, 2))
    totalreward = 0
    error_history = []

    for episode in range(1, num_episodes + 1):
        # initialize env
        state1 = State()
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        E = np.zeros((10, 21, 2))

        while state1 != "terminal":
            action1 = Epsilon_greedy_policy(value, counter, state1)
            state2, reward = Step(state1, action1)

            idx1 = (state1.dealercard - 1, state1.playersum - 1, action1)
            Q1 = value[idx1]

            if state2 == "terminal":
                Q2 = 0.0
            else:
                action2 = Policy(value, counter, state2)
                idx2 = (state2.dealercard - 1, state2.playersum - 1, action2)
                Q2 = value[idx2]

            counter[idx1] += 1
            E[idx1] += 1

            alpha = 1.0 / counter[idx1]
            delta = reward + GAMMA * Q2 - Q1
            value += alpha * delta * E
            E *= GAMMA * lmbd

            state1 = state2

        error_history.append((episode, mse(value, opt_value)))

    return value, error_history
def Sarsa():
    lmbd = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    learning_curves = {}
    state1 = State()
    num_episodes = 20000

    with open("./Q_dump_episodes_1000000.pkl", "rb") as f:
        opt_value = pickle.load(f)

    for item in lmbd:
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        Q_value, error_history = Sarsa_lamda_Control(item, opt_value, num_episodes)
        learning_curves[item] = error_history

    plot_file = "./outcome/Sarsa_error_{}_episodes_time_{}.pdf".format(num_episodes, time.time())
    plot_learning_curve(learning_curves, save=plot_file)
def compute_reward(task, trace_lines):
    # logging.info(f"compute_reward starts at {datetime.now()}")
    states = []
    actions = []
    # browser.reset(task.start_url)

    state_action_lines = [(line[:line.find(": ")], line[line.find(": ") + 2:])
                          for line in trace_lines]

    current_state_str, action_line = state_action_lines[0]
    current_state = State.load(states_dir, current_state_str)
    actions.append("RESET")
    states.append(current_state)
    task.reset(current_state, update_utg=False)

    last_action = load_action(current_state, action_line)
    actions.append(action_line)

    end_reached = False
    correct_rewards = [0]
    incorrect_rewards = [task.total_reward]

    for state_str, action_line in state_action_lines[1:]:
        current_state = State.load(states_dir, state_str)
        states.append(current_state)
        task.update(last_action, current_state, update_utg=False)

        if task.target_achieved:
            correct_rewards.append(task.total_reward)
        else:
            incorrect_rewards.append(task.total_reward)

        if action_line == "END":
            end_reached = True
            break
        else:
            last_action = load_action(current_state, action_line)

    max_correct_reward = max(correct_rewards)
    max_incorrect_reward = max(incorrect_rewards)
    logging.info(
        f" task got correct reward {max_correct_reward:6.3f}"
        f" and incorrect reward {max_incorrect_reward:3.3f}: {task.name}"
    )
    return max_correct_reward, max_incorrect_reward
def test_step_normal_Q_learning():
    # Test case when there is a previous state and action but the game
    # is not done. Check that Q is updated correctly in the
    # deterministic greedy case (epsilon=0).
    alpha = 0.1
    gamma = 0.9
    agent = TDAgent(0, alpha=alpha, gamma=gamma, epsilon0=0, method='q-learning')
    agent.prev_action = (1, 1)
    agent.prev_state = State()
    prev_afterstate = agent.prev_state.put(agent.prev_action, 0)

    # Construct a Q function to force a specific update.
    # Value before learning.
    q_prev = 0.6
    agent.Q[prev_afterstate] = q_prev

    curr_state = State().put((1, 1), 0).put((0, 0), 1)

    # Create two possible actions, one better than the other. The
    # agent should choose to use the value of taking action (2,2) for
    # the target value.
    agent.Q[curr_state.put((2, 2), 0)] = 0.7
    agent.Q[curr_state.put((0, 2), 0)] = 0.5
    future_return = gamma * 0.7

    # Take a step. We are not done, but give a nonzero reward to
    # check that it is used.
    reward = 1
    done = False
    action = agent.step(curr_state, reward, done)

    # Value after learning.
    q_curr = agent.Q[prev_afterstate]
    print(q_prev, q_curr)
    assert q_curr == q_prev + alpha * (reward + future_return - q_prev)
def lfa_learn(lmbd, opt_value, num_episodes):
    # initialize
    Q = np.zeros((10, 21, 2))
    counter = np.zeros((10, 21, 2))
    totalreward = 0
    error_history = []
    w = (np.random.rand(*FEATS_SHAPE) - 0.5) * 0.001

    for episode in range(1, num_episodes + 1):
        # initialize env
        state1 = State()
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        E = np.zeros_like(w)

        while state1 != "terminal":
            Qhat1, action1 = policy(state1, w)
            state2, reward = Step(state1, action1)
            Qhat2, action2 = policy(state2, w)

            feats1 = phi(state1, action1)
            grad_w_Qhat1 = feats1

            delta = reward + GAMMA * Qhat2 - Qhat1
            E = GAMMA * lmbd * E + grad_w_Qhat1
            dw = ALPHA * delta * E
            w += dw

            state1 = state2

        Q = expand_Q(w)
        error_history.append((episode, mse(Q, opt_value)))

    return Q, error_history
def choose_action(self, state: State, allowed_moves: List[Action]) -> Action:
    max_G = -10e15
    next_move = None
    random_N = np.random.random()

    if random_N < self.random_factor:
        # Explore: pick a random allowed move.
        next_move = np.random.choice(allowed_moves)
    else:
        # Exploit: pick the allowed move whose resulting state has the highest G.
        for action in allowed_moves:
            y = Maze.action_space[action].dy
            x = Maze.action_space[action].dx
            new_state = State(state.x + x, state.y + y)
            if self.G[new_state] >= max_G:
                next_move = action
                max_G = self.G[new_state]

    return next_move
def trial(robot: Agent) -> List[int]:
    maze = Maze()
    move_history = []

    for i in range(5000):
        if i % 1000 == 0:
            print(i)

        while not maze.is_complete():
            state, _ = maze.get_state_and_reward()
            action = robot.choose_action(state, maze.allowed_states[state])
            maze.update_maze(action)
            state, reward = maze.get_state_and_reward()
            robot.update_state_history(state, reward)
            if maze.steps > 1000:
                # Cut runaway episodes short by forcing the robot to position (5, 5).
                maze.robot_position = State(5, 5)

        robot.learn()
        move_history.append(maze.steps)
        maze.reset()

    return move_history
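# Hedged usage sketch for trial(): the exact "states" argument expected by the
# Agent constructor above is not shown here, so passing the maze's
# allowed-states mapping is an assumption.
maze = Maze()
robot = Agent(maze.allowed_states, alpha=0.15, random_factor=0.25)
move_history = trial(robot)
print("last 10 episode lengths:", move_history[-10:])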
def successor_function(state):
    actions = state.functions
    result = []

    # Simulate each available action and collect the resulting states.
    for x in range(len(actions)):
        # Create a fresh State so the successor does not share memory with the parent.
        new_state = State()
        new_state.connor.hp = state.connor.hp
        new_state.terminator.hp = state.terminator.hp
        new_state.terminator.defense = state.terminator.defense
        connor = new_state.connor
        terminator = new_state.terminator

        # Apply the action, then append the resulting state.
        if actions[x] == 'attack':
            connor.attack(terminator)
            result.append(new_state)
        else:
            connor.defend(terminator)
            result.append(new_state)

    return actions, result
def lfa_sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, alpha=0.01, yield_progress=False):
    # Set up the coarse codes, initial weights.
    action_codes = {}
    for action in list(Action):
        action_fns = []
        for dealer_interval in [(1, 4), (4, 7), (7, 10)]:
            for player_interval in [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]:
                cuboid_fn = create_cuboid_fn(dealer_interval, player_interval, action)
                action_fns.append(cuboid_fn)
        action_codes[action] = action_fns

    def greedy(s, w):
        p, d = s
        action_values = []
        for a in list(Action):
            value = 0
            for cuboid_fn in action_codes[a]:
                if cuboid_fn(p, d, a):
                    value += w.get(cuboid_fn, 0)
            action_values.append((a, value))
        action_values.sort(key=itemgetter(1), reverse=True)
        return action_values[0][0]

    def e_greedy(s, w, epsilon=0.05):
        a_best = greedy(s, w)
        selection_probs = []
        default_p = epsilon / len(Action)
        for a in list(Action):
            if a is a_best:
                selection_probs.append(1 - epsilon + default_p)
            else:
                selection_probs.append(default_p)
        return sample_action(selection_probs)

    def f_sa(s, a):
        p, d = s
        for cuboid_fn in action_codes[a]:
            if cuboid_fn(p, d, a):
                yield cuboid_fn

    def compile_q_sa(w):
        q_sa = {}
        for (p, d), a in generate_all_state_action_pairs():
            sa = (p, d, a)
            val = 0
            for i in f_sa((p, d), a):
                val += w.get(i, 0)
            q_sa[sa] = val
        return q_sa

    w_f = {}

    for n in range(num_episodes):
        state = State()
        s = state.as_tuple()
        a = e_greedy(s, w_f)
        z_f = {}

        while not state.terminal:
            state, reward = step(state, a)

            delta = reward
            for i in f_sa(s, a):
                delta = delta - w_f.get(i, 0)
                z_f[i] = z_f.get(i, 0) + 1

            if state.terminal:
                for i, zi in z_f.items():
                    w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                break

            s_next = state.as_tuple()
            a_next = e_greedy(s_next, w_f)

            for i in f_sa(s_next, a_next):
                delta = delta + gamma * w_f.get(i, 0)

            for i, zi in z_f.items():
                w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                z_f[i] = gamma * lamba * zi

            s = s_next
            a = a_next

        if yield_progress:
            yield n + 1, compile_q_sa(w_f)

    if not yield_progress:
        yield num_episodes, compile_q_sa(w_f)
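# Hedged usage sketch: like sarsa_lambda above, lfa_sarsa_lambda() is a
# generator; with yield_progress=False, exhausting it once gives the final
# compiled Q table built from the linear function approximation weights.
_, q_sa_lfa = next(lfa_sarsa_lambda(num_episodes=1000, lamba=0.5, alpha=0.01))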
from environment import State
from agents import ComputerPlayer, HumanPlayer

if __name__ == "__main__":
    # play with human
    p1 = ComputerPlayer("computer", exp_rate=0)
    p1.load_policy("policy_p1")

    p2 = HumanPlayer("human")

    st = State(p1, p2)
    st.play_with_human()
from environment import State
from agents import ComputerPlayer

if __name__ == "__main__":
    # training
    p1 = ComputerPlayer("p1")
    p2 = ComputerPlayer("p2")

    st = State(p1, p2)
    print("training...")
    st.play_with_ai(50000)

    p1.save_policy()
    p2.save_policy()
from environment import State
from utils import Queue
from ai import bfs, ids, best_fs, a_star_search
from ui import AoasUI
import tkinter as tk
from tkinter import ttk

if __name__ == '__main__':
    # Initialize all important things
    problem = State()
    fringe = Queue()
    next_gen = Queue()
    depth = 3
    count = 0
    saved_input = None
    user_input = None

    # Manual algorithm switching
    manual = False

    # While the solution has not been found, do all of this
    solved = False
    while not solved:
        # saved_input to automate the entire thing for long processes
        if saved_input is None and not manual:
            saved_input = input(
                '"1" for BFS, "2" for IDS, "3" for Greedy BFS, "4" for A* Search: '
            )
        elif manual:
            saved_input = input(
from environment import State
from Net_pg import PolicyGradient
import numpy as np

N = 20
env = State()
RL = PolicyGradient(
    n_actions=env.n_actions,
    n_features=env.n_features,
    learning_rate=0.01,
    reward_decay=0.99,
)

fid_max = 0

for episode in range(500):
    observation = env.reset()
    for ii in range(N):
        action = RL.choose_action(observation)
        observation_, reward, done, fid = env.step(action, ii)
        RL.store_transition(observation, action, reward)
        observation = observation_
        if done or ii >= N - 1:
            break
    if episode >= 490:
        if fid > fid_max: