Example #1
    def __init__(self, n_neurons, tau, eta, gamma, lambda_eligibility):

        self.mountain_car = mountaincar.MountainCar()

        # Parameters
        self.n_neurons = n_neurons
        self.tau = tau
        self.lambda_eligibility = lambda_eligibility
        self.gamma = gamma
        self.eta = eta

        # Defines the neural lattice
        self.neurons_pos = np.linspace(-150, 30, n_neurons)
        print(self.neurons_pos)
        self.sigma_pos = self.neurons_pos[1] - self.neurons_pos[0]
        print(self.sigma_pos)
        self.neurons_vel = np.linspace(-15, 15, n_neurons)
        print(self.neurons_vel)
        self.sigma_vel = self.neurons_vel[1] - self.neurons_vel[0]
        print(self.sigma_vel)
        self.pos_grid, self.vel_grid = np.meshgrid(self.neurons_pos,
                                                   self.neurons_vel)

        # initialize the Q-values etc.
        self._init_run()
Example #2
    def __init__(self, mountain_car=None):

        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        self.nn = NeuralNet(20, 20)
Example #3
    def __init__(self, mountain_car = None, parameter1 = 3.0):
        
        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        self.parameter1 = parameter1
Example #4
    def __init__(self,
                 mountain_car=None,
                 size=20,
                 eta=0.05,
                 gamma=0.99,
                 tau=1,
                 eligibity_trace_decay=0.95,
                 tau_decay=True,
                 non_zero_weights=False):
        # GridWorld / neural net size
        self.N = size

        self.t = 0

        # reward administered at the target location and when
        # bumping into walls
        self.reward_at_target = 1.

        # learning rate
        self.eta = eta

        # discount factor - quantifies how far into the future
        # a reward is still considered important for the
        # current action
        self.gamma = gamma

        # decay factor for the eligibility trace; a value of 0
        # corresponds to no eligibility trace at all
        self.eligibity_trace_decay = eligibity_trace_decay

        # Exploration parameter
        self.tau = tau
        self.tau_decay = tau_decay

        # Grid centers
        x_centers = np.linspace(-150, 30, self.N)
        dx_centers = np.linspace(-15, 15, self.N)

        # Variance for the input function of the centers
        self.var_x = ((150 + 30) / self.N)**2
        self.var_dx = ((15 + 15) / self.N)**2

        # Create grid given the centers
        self.x_grid, self.dx_grid = np.meshgrid(x_centers, dx_centers)

        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        # initialize the weights and eligibility traces
        if non_zero_weights:
            self.w = np.ones((3, self.N**2))
        else:
            self.w = np.zeros((3, self.N**2))

        self._reset_e_values()
Example #5
    def __init__(self,
                 mountain_car=None,
                 eta=0.01,
                 tau=lambda x: 0.01,
                 lambd=0.98,
                 weight=0.5,
                 seed=11):
        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        ### SARSA parameters
        self.tau = tau  # temperature -> exploration vs exploitation parameter.
        self.eta = eta  # learning rate for weight update << 1
        self.lambd = lambd  # eligibility decay rate 0 < lambda < 1
        self.gamma = 0.95  # reward factor
        self.weights_hist = [[], []]

        ### setting up the neural network
        # computing interval parameters
        self.nNeuronsX = 20  # minimum 2
        self.nNeuronsPsi = 20
        self.inputDim = self.nNeuronsX * self.nNeuronsPsi
        self.outputDim = 3
        self.xCenters = np.linspace(
            -150, 30, self.nNeuronsX + 1,
            False)[1:]  # split the position interval excluding extreme values
        self.psiCenters = np.linspace(
            -15, 15, self.nNeuronsPsi + 1,
            False)[1:]  # split the speed interval excluding extreme values
        self.sigmaX = self.xCenters[1] - self.xCenters[0]
        self.sigmaPsi = self.psiCenters[1] - self.psiCenters[0]

        # generating the input neurons
        iN = []
        for psic, psi in enumerate(self.psiCenters):
            iN.append([])
            for xc, x in enumerate(self.xCenters):
                iN[psic].append(S(x, psi))
        self.iN = np.transpose(np.array(iN))

        # listing the actions
        self.ActDict = [-1, 0, 1]

        np.random.seed(seed)

        self.weights = np.ones(
            (3, self.iN.shape[0], self.iN.shape[1])) * weight

        self.eligibilities = np.zeros((3, self.iN.shape[0], self.iN.shape[1]))
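Note: the `S(x, psi)` objects stored in `iN` above are not defined in this snippet. A common choice for this kind of input layer is a Gaussian tuning curve over position and speed; the following is only a sketch of what such a class might look like, assuming the neuron widths are available as module-level constants (the name `S`, its methods and the constants are assumptions, not the original code).

import numpy as np

# Assumed module-level widths; the original code may supply these differently.
SIGMA_X = 180.0 / 20    # spacing of the 20 position centres on [-150, 30]
SIGMA_PSI = 30.0 / 20   # spacing of the 20 speed centres on [-15, 15]

class S:
    """Hypothetical input neuron with a Gaussian tuning curve centred at (x, psi)."""

    def __init__(self, x, psi):
        self.x = x      # preferred position
        self.psi = psi  # preferred speed

    def response(self, x, psi):
        # activation is maximal when the current state matches the centre
        return np.exp(-((x - self.x) / SIGMA_X) ** 2
                      - ((psi - self.psi) / SIGMA_PSI) ** 2)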
Example #6
def main():
    parser = argparse.ArgumentParser(
        description="Policy Evaluation Random Dyna")
    parser.add_argument("--cfg",
                        type=str,
                        default='config.json',
                        help="config file name")
    args = parser.parse_args()
    fname = args.cfg
    config = update_config(fname)
    tile = MountainCarTileCoder(iht_size=10000, num_tilings=10, num_tiles=8)
    theta = np.random.uniform(-0.001, 0, size=(tile.n))
    F = np.zeros((tile.n, tile.n))
    b = np.zeros((tile.n))
    alpha = config["alpha"]
    gamma = config["gamma"]
    epsilon = config["epsilon"]
    N_0 = config["N_0"]
    numEpisodes = config["numEpisodes"]
    stepsPerEpisode = config["stepsPerEpisode"]
    n = config["n_planning_steps"]
    loss = []
    env = mountaincar.MountainCar()

    for episodeNum in tqdm(range(1, numEpisodes + 1)):
        G = 0
        env.init()
        state = env.start()
        #print(episodeNum, ":\n")
        for step in range(stepsPerEpisode):
            phi = tile.get_tiles(position=state[0], velocity=state[1])
            phi = tile.get_one_hot(phi)
            action = policy(state, epsilon)
            reward, state2, done = env.step(action)
            phi_prime = tile.get_tiles(position=state2[0], velocity=state2[1])
            phi_prime = tile.get_one_hot(phi_prime)

            G += reward
            delta = reward + (gamma * (theta.T @ phi_prime)) - (theta.T @ phi)
            #modelling
            F = F + alpha * np.outer((phi_prime - np.dot(F, phi)), phi)
            b = b + alpha * ((reward - b.T @ phi) * phi)
            theta += alpha * delta * phi
            #plan
            theta = planning(n, theta, F, b, tile, gamma, alpha)
            state = state2
        alpha = alpha * ((N_0 + 1) / (N_0 + (episodeNum)**1.1))
        loss.append(delta**2)
    stname = "losses" + "_n0_" + "{}".format(
        config["N_0"]) + "_alpha" + "{}".format(config["alpha"])
    np.save(stname + 'loss', loss)
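The `planning` function called in the loop above is defined elsewhere in that project. In linear Dyna, a planning step typically samples a feature vector, queries the learned model (`F` for feature dynamics, `b` for expected reward) and applies a TD-style update to `theta`. The sketch below illustrates that idea under those assumptions; it is not necessarily the author's implementation.

import numpy as np

def planning(n, theta, F, b, tile, gamma, alpha):
    """Hypothetical linear-Dyna planning: n simulated TD updates from the model."""
    for _ in range(n):
        # sample a feature vector; here simply a random one-hot vector
        phi = np.zeros(tile.n)
        phi[np.random.randint(tile.n)] = 1.0
        # model predictions: expected next features and expected reward
        phi_next = F @ phi
        r_pred = b @ phi
        # TD(0) update on the simulated transition
        delta = r_pred + gamma * (theta @ phi_next) - (theta @ phi)
        theta = theta + alpha * delta * phi
    return theta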
Example #7
 def __init__(self,
              mc=None,
              net=None,
              temp=None,
              learn_rate=1e-2,
              reward_factor=0.95,
              el_tr_rate=None,
              temp_fun=None):
     self.mc = mountaincar.MountainCar() if mc is None else mc
     self.net = Network() if net is None else net
     self.temp0 = 0.1 if temp is None else temp
     self.learn_rate = learn_rate
     self.reward_factor = reward_factor
     self.el_tr_rate = 0.95 if el_tr_rate is None else el_tr_rate
     self.temp_fun = temp_fun
Example #8
    def __init__(self, seed=1):
        self.rnd = np.random.RandomState(seed)
        self.mountain_car = mountaincar.MountainCar(self.rnd)
        
        # Initialize constants
        self.x_min = -150.0
        self.x_max = 30.0
        self.x_n = 20 # Number of subdivisions along the position axis

        self.v_min = -15.0
        self.v_max = 15.0
        self.v_n = 5 # Number of subdivisions along the speed axis
        
        self.x_centers = np.linspace(self.x_min, self.x_max, self.x_n)
        self.v_centers = np.linspace(self.v_min, self.v_max, self.v_n)
        self.x_sigma = self.x_centers[1] - self.x_centers[0]
        self.v_sigma = self.v_centers[1] - self.v_centers[0]
Example #9
    def __init__(self,
                 mountain_car=None,
                 temperature=0.1,
                 trace_decay_rate=0.95,
                 grid_size=20,
                 num_actions=3,
                 learning_rate=1e-2,
                 weight_init='constant',
                 weight=0.5):

        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        if weight_init == 'uniform':
            self.network_weights = np.random.rand(num_actions,
                                                  grid_size * grid_size)
        elif weight_init == 'constant':
            self.network_weights = weight * np.ones(
                (num_actions, grid_size * grid_size))
        self.activations = np.zeros((grid_size, grid_size))
        self.activations1 = np.zeros((grid_size, grid_size))

        xcenters = np.linspace(-150, 30, grid_size + 1, False)[1:]
        xdcenters = np.linspace(-15, 15, grid_size + 1, False)[1:]
        self.centers = np.array(
            np.meshgrid(xcenters, np.flip(xdcenters, axis=0)))

        self.sigma = np.zeros(2)
        self.sigma[0] = xcenters[1] - xcenters[0]
        self.sigma[1] = xdcenters[1] - xdcenters[0]

        self.temperature = temperature
        self.discount_factor = 0.95
        self.learning_rate = learning_rate
        self.trace_decay_rate = trace_decay_rate
        self.eligibility_traces = np.zeros_like(self.network_weights)

        self.a = 0.0
        self.a1 = 0.0
        self.r = 0.0
        self.q = 0.0
        self.q1 = 0.0
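The `temperature` stored above is usually consumed by a softmax (Boltzmann) policy over the three Q-values. A minimal, self-contained sketch of that selection rule (the function name and exact normalisation are illustrative, not taken from this class):

import numpy as np

def softmax_action(q_values, temperature):
    """Sample an action index from a Boltzmann distribution over Q-values."""
    # subtract the maximum before exponentiating for numerical stability
    z = (np.asarray(q_values) - np.max(q_values)) / temperature
    p = np.exp(z)
    p /= p.sum()
    return np.random.choice(len(p), p=p)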
Example #10
    def __init__(self, mountain_car=None):

        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car
Example #11
def watkins_q(alpha=0.15, epsilon=0, gamma=1, lamb=0.9, episodes=100, trace="replace"):
    '''Performs linear, gradient-descent Watkins's Q(lambda) learning'''
    # Perform initializations
    episode_rewards = []
    env = mountaincar.MountainCar()
    tiles = make_tiles()
    action_size = len(tiles)
    tile_overlap = len(tiles[0][0])
    tile_size = len(tiles[0][0][0])*len(tiles[0][1][0])
    tile_number = action_size*tile_size*tile_overlap
    theta = np.zeros(tile_number)
    
    # Run number of episodes for learning
    for _ in range(episodes):
        env.reset()
        total_reward = 0
        eligibility = np.zeros(tile_number)
        env_state = (env.position, env.velocity)
        chosen_action = np.random.choice(env.actions)
        features = find_features(tiles, env_state, chosen_action)
        
        # Repeat for each step in episode
        while not env.game_over:
            # Action the appropriate trace
            for i in range(len(features)):
                # Convert 3D state-action tile features to 1D index for theta table
                pos_index, vel_index = features[i]
                index = np.ravel_multi_index([pos_index, vel_index], (len(tiles[0][0][0]), len(tiles[0][1][0])))
                index += (tile_size*i) + ((chosen_action+1)*(tile_size*tile_overlap))
                if trace == "accumulate":
                    eligibility[index] += 1
                elif trace == "replace":
                    eligibility[index] = 1
                else:
                    print("Unknown trace type")
                    break
                    
            # Take action and observe next state and reward
            reward = env.make_step(action=chosen_action)
            env_state = (env.position, env.velocity)
            delta = reward - evaluate_theta(theta, features, chosen_action)
            
            # Update delta from max Qa
            Q_actions = []
            for action in env.actions:
                features = find_features(tiles, env_state, action)
                Q_actions.append(evaluate_theta(theta, features, action))
            
            delta += (gamma * max(Q_actions))
            theta += (alpha * delta * eligibility)
            
            # Perform epsilon-greedy action
            chance = np.random.uniform(0,1)
            if (chance >= epsilon):
                Q_actions = []
                for action in env.actions:
                    features = find_features(tiles, env_state, action)
                    Q_actions.append(evaluate_theta(theta, features, action))
                # Choose the action with the maximum Q-value
                chosen_action = env.actions[np.argmax(Q_actions)]
                features = find_features(tiles, env_state, chosen_action)
                eligibility *= (gamma*lamb)

            else:
                chosen_action = np.random.choice(env.actions)
                eligibility *= 0
            
            total_reward += reward
            #env.plot()
        
        # Add total reward for episode to array (to allow plotting) and return
        episode_rewards.append(total_reward)
    
    return episode_rewards
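A possible way to run the function above and inspect learning progress, assuming `matplotlib` is installed and the helpers `make_tiles`, `find_features` and `evaluate_theta` are defined in the same module:

import matplotlib.pyplot as plt

rewards = watkins_q(alpha=0.15, epsilon=0.05, gamma=1.0, lamb=0.9,
                    episodes=100, trace="replace")
plt.plot(rewards)
plt.xlabel("episode")
plt.ylabel("total reward")
plt.show()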
Example #12
 def __init__(self, agent):
     self.mountain_car = mountaincar.MountainCar()
     self.agent = agent
Example #13
    def __init__(self,
                 mountain_car=None,
                 eta=0.05,
                 gamma=0.95,
                 lam=0.8,
                 initial_epsilon=0.1,
                 initial_temperature=1.0,
                 neurons=10,
                 time=100,
                 dt=0.01,
                 actions=3,
                 n_steps=10000,
                 n_episodes=10,
                 run_type="Default",
                 explore_temp=False,
                 explore_lam=False,
                 explore_both=False,
                 explore_weights=False,
                 weights=.05,
                 greedy=False,
                 verbose=False):

        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        # Learning rate
        self.eta_ = eta
        # Reward Factor
        self.gamma_ = gamma
        # Decay Eligibility
        self.lambda_ = lam
        self.min_lambda_ = 0

        # Choice of Random Action or Not
        self.greedy_flag = greedy
        self.initial_epsilon_ = initial_epsilon

        # Exploration vs Exploitation parameter
        self.initial_temperature_ = initial_temperature
        self.min_temperature_ = 0.001

        # Neuron Centers
        self.neuron = neurons
        self.neuron_count = self.neuron**2
        _x_space_, self.x_centers_distance = np.linspace(-150,
                                                         30,
                                                         neurons,
                                                         retstep=True)
        _x_d_space_, self.phi_centers_distance = np.linspace(-15,
                                                             15,
                                                             neurons,
                                                             retstep=True)
        self.centers = np.array(list(itertools.product(_x_space_,
                                                       _x_d_space_)))
        self.x_sigma = self.x_centers_distance**2
        self.x_d_sigma = self.phi_centers_distance**2

        # Activity / State Parameters
        self.number_of_actions = actions
        self.activity = {"Right": 0, "Left": 1, "Neutral": 2}
        self.action_index_ = {"1": 0, "-1": 1, "0": 2}
        self.last_action = None
        self.action = 0
        self.old_state = None
        self.state = [self.mountain_car.x, self.mountain_car.x_d]
        self.old_index = None
        self.index = self._get_index(self.state)

        # Trace Memory
        self.e = np.zeros((self.neuron_count, actions))
        if not explore_weights:
            self.weights = np.random.rand(self.neuron_count, actions)
        if explore_weights:
            self.weights = np.ones((self.neuron_count, actions)) * weights

        # Time step for Simulation
        self.time = time
        self.dt = dt
        self.n_steps = n_steps
        self.n_episodes = n_episodes

        # Exploration
        self.explore_temp = explore_temp
        self.explore_lam = explore_lam
        self.explore_both = explore_both

        # Save Data
        save_data_name = datetime.now().strftime('%m-%d-%H.%M.%S')
        self.filename = "{0}-{1}s.hdf5".format(run_type, save_data_name)

        # Verbose Toggle
        self.verbose = verbose
Example #14
    def train(
            self,
            n_steps,
            n_episodes,
            reward_factor,
            eligibility_decay,
            init_learning_rate,
            duration_learingrate,
            target_learning_rate,
            init_tau,
            duration_tau,
            target_tau,
            min_learning_rate=0.005,
            min_tau=0.01,  # must not be lower than 0.01
            step_penalty=-0.1,
            mountain_car=None,
            save_to_file=True,
            show_intermediate=False,
            show_trace=False,
            show_interactive=True,
            show_weights=False):
        """
        duration_*: positive integer. Determines at which episode the * parameter reaches it's minimum value. Note that the parameter continues to shrink when it reached the target_learning_rate value.
        min_*: spezifies a lower bound on the * parameter
        save_to_file: if True, then stores the NN after the training to a file. can be a string (directory where to store the NN)
        show_intermediate: if True, shows a plot all 100 episodes
        show_trace: if True, shows the trace of the car for each episode
        """

        #parameter checks
        assert init_tau > 0.0
        assert init_learning_rate != 0.0
        assert n_steps is None or n_steps > 0
        assert n_episodes > 0
        assert duration_tau > 0
        assert duration_learingrate > 0

        tau = float(init_tau)
        learning_rate = float(init_learning_rate)
        tau_update_factor = (target_tau / init_tau)**(1.0 / duration_tau)
        learning_rate_update_factor = (
            target_learning_rate / init_learning_rate)**(1.0 /
                                                         duration_learingrate)

        if mountain_car is None:
            mountain_car = mc.MountainCar()

        if n_steps is None:
            n_steps = float('inf')

        # init history
        self.history.append({
            'episodes': n_episodes,
            'steps': n_steps,
            'init_learning_rate': init_learning_rate,
            'duration_learingrate': duration_learingrate,
            'target_learning_rate': target_learning_rate,
            "min_learning_rate": min_learning_rate,
            'init_tau': init_tau,
            'duration_tau': duration_tau,
            'target_tau': target_tau,
            "min_tau": min_tau,
            'eligibility_decay': eligibility_decay,
            'step_penalty': step_penalty,
            'reward_factor': reward_factor,
            'sucess_indexes': [],
        })

        for ep in range(n_episodes):
            # run episode
            t = time()
            print("episode", ep, "/", n_episodes, "tau:", tau, "lrate:",
                  learning_rate)
            idx, trace = self._episode(mountain_car,
                                       learning_rate=learning_rate,
                                       reward_factor=reward_factor,
                                       eligibility_decay=eligibility_decay,
                                       n_steps=n_steps,
                                       step_penalty=step_penalty,
                                       tau=tau)
            self.history[-1]['sucess_indexes'].append(idx)
            print("  calc_t={:.4f}s".format(time() - t))

            # update tau and learning rate
            tau = max(min_tau, tau * tau_update_factor)
            learning_rate = max(min_learning_rate,
                                learning_rate * learning_rate_update_factor)
            #learning_rate = max(min_learning_rate, 1.0/np.sqrt(ep+44)) # sqrt

            t = time()
            # show some stuff
            if show_interactive:
                self.show_output(figure_name='activations_interactive',
                                 tau=tau,
                                 interactive=True)
                self.show_vector_field(figure_name='vector field interactive',
                                       tau=tau,
                                       interactive=True)
            if show_trace is True or (show_trace == 'not_succeeded'
                                      and idx > n_steps - 2):
                self.show_trace(figure_name='trace_interactive',
                                trace=trace,
                                interactive=True)
            if show_intermediate and ep % 100 == 99:
                self.show_output(figure_name='activations_' + str(ep),
                                 tau=tau,
                                 interactive=False)
                self.show_vector_field(figure_name='vector field' + str(ep),
                                       tau=tau,
                                       interactive=False)
            if show_weights is True:
                self.show_weights(figure_name="weights", interactive=True)
            if show_weights == 'intermediate' and ep % 1000 == 999:
                self.show_weights(figure_name="weights_" + str(ep),
                                  interactive=False)

            print("  plot_t={:.4f}s".format(time() - t))
        #end for episodes

        # save the NN
        if save_to_file is True:
            self._store_to_file()
        elif isinstance(save_to_file, str):
            self._store_to_file(path=save_to_file)

        # concatenate all previous success_indexes
        ret_si = []
        for h in self.history:
            ret_si += h['sucess_indexes']
        return ret_si
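The `tau` and learning-rate schedules above are geometric: the update factor `(target / init) ** (1.0 / duration)` brings the parameter from its initial value to its target value after `duration` episodes, after which it keeps shrinking until clipped at the corresponding `min_*` bound. A small standalone check of that schedule (the numbers are purely illustrative):

init_tau, target_tau, duration_tau, min_tau = 1.0, 0.05, 100, 0.01
factor = (target_tau / init_tau) ** (1.0 / duration_tau)

tau = init_tau
for episode in range(200):
    tau = max(min_tau, tau * factor)
    if episode == duration_tau - 1:
        print("after duration_tau episodes:", tau)  # ~= target_tau
print("final:", tau)  # eventually clipped at min_tau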
Example #15
    def __init__(self,
                 mountain_car=None,
                 x_linspace=(-150, 30, 20),
                 v_linspace=(-15, 15, 20),
                 w=None,
                 tau=1,
                 gamma=0.95,
                 eta=0.001,
                 lambda_=0.95):
        ''' Initialize the object '''

        # saving the environment object
        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        # range for x neurons grid
        self.x_values = np.linspace(*x_linspace)

        # range for v neurons grid
        self.v_values = np.linspace(*v_linspace)

        # steps x and v
        self.delta_x = self.x_values[1] - self.x_values[0]
        self.delta_v = self.v_values[1] - self.v_values[0]

        # sigmas x and v
        self.sigma_x = np.array([self.delta_x] * len(self.x_values))
        self.sigma_v = np.array([self.delta_v] * len(self.v_values))

        # number of actions
        self.n_actions = 3

        # number of neurons
        self.n_neurons = len(self.x_values) * len(self.v_values)

        # weight matrix
        if w is None:
            #self.w = np.random.randn(self.n_actions, self.n_neurons)
            self.w = np.zeros((self.n_actions, self.n_neurons))
        else:
            self.w = np.copy(w)
            assert self.w.shape == (
                self.n_actions,
                self.n_neurons), "Please provide w with valid shape"

        # history of w
        self.w_history = []
        self.w_history.append(self.w)

        # history of escape latency
        self.escape_latency = []

        # sampling softmax temperature
        # can be a function from learning iteration
        self.tau = tau

        # setting tau
        if callable(self.tau):
            self.tau_func = self.tau
            self.tau = self.tau_func(0)

        # reward discount factor
        self.gamma = gamma

        # learning rate
        self.eta = eta

        # eligibility trace parameter
        self.lambda_ = lambda_

        # number of iterations learned
        self.learning_counter = 0
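Because `tau` may be a callable of the learning iteration, an annealed softmax temperature can be passed directly at construction. A hypothetical instantiation, assuming the enclosing class is named `Agent` (the exponential schedule is only an illustration):

agent = Agent(tau=lambda i: max(0.01, 0.99 ** i),
              gamma=0.95,
              eta=0.001,
              lambda_=0.95)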
Example #16
    def __init__(self,
                 mountain_car=None,
                 side_size=10,
                 tau=0.05,
                 x_range=(-150, 30),
                 v_range=(-15, 15),
                 weights=None,
                 eta=0.01,
                 gamma=0.95,
                 lambdaa=0.9):
        """ Makes a new agent with given parameters:
        Model:
            mountain_car : Instance of MountainCar
            side_size    : input neurons are arranged in a grid of this size -- scalar
            tau          : strategy exploration temperature -- scalar
            x_range      : range of positions to cover with input neurons -- 2-tuple
            v_range      : range of velocities to cover with input neurons -- 2-tuple
            weights      : from input neurons to output neurons -- array(3 x side_size x side_size)

        Learning:
            eta          : learning rate -- scalar << 1
            gamma        : future state discounting factor -- scalar (0.95 recommended)
            lambdaa      : eligibility decay rate -- scalar in (0,1)
        """

        if mountain_car is None:
            self.mountain_car = mountaincar.MountainCar()
        else:
            self.mountain_car = mountain_car

        if weights is None:
            self.weights = np.ones((3, side_size, side_size))
        else:
            self.weights = weights

        self.eligibility_trace = np.empty_like(self.weights)

        # neuron preference centres, widths:
        self.centres_x, self.sigma_x = np.linspace(x_range[0],
                                                   x_range[1],
                                                   side_size,
                                                   endpoint=True,
                                                   retstep=True)
        self.centres_v, self.sigma_v = np.linspace(v_range[0],
                                                   v_range[1],
                                                   side_size,
                                                   endpoint=True,
                                                   retstep=True)

        # we transpose one of the dimensions so that it will be broadcasted nicely with the other
        self.centres_x = np.atleast_2d(self.centres_x)
        self.centres_v = np.atleast_2d(self.centres_v).T

        # we always use sigma**2 in our calculations, so save that one instead
        self.sigma_x = self.sigma_x**2
        self.sigma_v = self.sigma_v**2

        # save the rest of the params
        self.tau = tau
        self.eta = eta
        self.gamma = gamma
        self.lambdaa = lambdaa
        self.side_size = side_size

        # number of steps used per episode
        self.escape_times = []
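A hypothetical instantiation of the agent defined above; the class name `SarsaAgent` is an assumption, and only keyword arguments that appear in the signature are used:

import mountaincar

car = mountaincar.MountainCar()
agent = SarsaAgent(mountain_car=car,
                   side_size=10,
                   tau=0.05,
                   eta=0.01,
                   gamma=0.95,
                   lambdaa=0.9)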