Example #1
 def __init__(self, q, alpha, reward, discount, initial_state, actions):
     self.q = {}
     self.alpha = alpha
     self.reward = reward
     self.discount = discount
     self.states = initial_state
     QLearn.__init__(self, actions, len(initial_state), alpha)
Example #2
 def __init__(self, q, alpha, reward, discount, initial_state, actions):
     self.q = {}
     self.alpha = alpha
     self.reward = reward
     self.discount = discount
     self.states = initial_state
     QLearn.__init__(self, actions, len(initial_state), alpha)
Example #3
def load_model(actions, input_dir, circuit, experiment, number):

    path = os.path.join(input_dir, circuit, experiment, number)
    q_table_path_file = os.path.join(path, sorted(os.listdir(path))[0])

    # Q-tables are pickled, so the file must be opened in binary mode
    qlearn_file = open(q_table_path_file, "rb")
    model = pickle.load(qlearn_file)

    qlearn = QLearn(actions=actions, alpha=0.2, gamma=0.9, epsilon=0.05)
    qlearn.q = model

    print(
        "\n\n---------------- MODEL LOADED ----------------\n {}\n-----------------------\n\n"
        .format(qlearn_file))

    return qlearn
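A minimal usage sketch, assuming the directory layout the function expects, i.e. <input_dir>/<circuit>/<experiment>/<number>/ containing a pickled Q-table; all names below are hypothetical:

actions = range(3)  # hypothetical action set
qlearn = load_model(actions, "checkpoints", "simple_circuit", "experiment_1", "1")
print(len(qlearn.q))  # number of entries in the loaded Q-table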
Example #4
 def __init__(self, orderbook, side, T, I, ai=None, levels=None):
     self.orderbook = orderbook
     self.side = side
     self.levels = levels
     if not ai:
         ai = QLearn(self.levels)  # levels are our qlearn actions
     self.ai = ai
     self.T = T
     self.I = I
Example #5
    def __init__(self,
                 policy="greedy",
                 lvfa=False,
                 dim=7,
                 nA=7,
                 nS=78125,
                 epsilon=0.05,
                 alpha=0.01,
                 gamma=0.9,
                 ellgibility_trace=True,
                 Q=None):

        self.policy_class = Policy(nA=nA, epsilon=epsilon)

        if policy == "greedy":
            self.policy = self.policy_class.greedy_policy

        if policy == "eps_greedy":
            self.policy = self.policy_class.eps_policy

        if policy == "softmax":
            self.policy = self.policy_class.softmax_policy

        self.reset_ellgibility_trace = self.do_nothing

        if lvfa:
            self.agent = LVFA(dim=dim,
                              nA=nA,
                              nS=nS,
                              alpha=alpha,
                              gamma=gamma,
                              policy=self.policy)
            self.learn = self.agent.learn
            self.chooseAction = self.agent.chooseAction
            self.save_model = self.agent.save_model
            self.load_model = self.agent.load_model
        else:
            self.agent = QLearn(nA=nA,
                                nS=nS,
                                epsilon=epsilon,
                                alpha=alpha,
                                gamma=gamma,
                                Q=Q,
                                policy=self.policy)
            self.chooseAction = self.agent.chooseAction
            self.save_model = self.agent.save_model
            self.load_model = self.agent.load_model

            if ellgibility_trace:
                self.learn = self.agent.learn_ellgibility_trace
                self.reset_ellgibility_trace = self.agent.reset_ellgibility_trace
            else:
                self.learn = self.agent.learn
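A usage sketch for the wrapper above; AgentWrapper is a hypothetical name for the class this __init__ belongs to, and the discrete state index is an assumption based on the tabular defaults (nS=78125):

agent = AgentWrapper(policy="eps_greedy", lvfa=False, ellgibility_trace=False)
state = 0                            # e.g. a discrete state index in [0, nS)
action = agent.chooseAction(state)   # delegated to QLearn.chooseAction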
Example #6
 def testStateEquality(self):
     ai = QLearn([-1, 0, 1])
     a1 = ActionState(1.0, 1.0, {'vol60': 1})
     a2 = ActionState(1.0, 1.0, {'vol60': 1})
     ai.learn(a1, 1, 1.0, a2)
     self.assertEqual(ai.getQAction(a2), 1)
Example #7
import unittest
from qlearn import QLearn
from action_state import ActionState
import numpy as np


class QlearnTest(unittest.TestCase):
    def testStateEquality(self):
        ai = QLearn([-1, 0, 1])
        a1 = ActionState(1.0, 1.0, {'vol60': 1})
        a2 = ActionState(1.0, 1.0, {'vol60': 1})
        ai.learn(a1, 1, 1.0, a2)
        self.assertEqual(ai.getQAction(a2), 1)

    # def testQTableLookup(self):
    #     actions = [5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -7, -10, -15, -20]
    #     ai = QLearn(actions)
    #     ai.q = np.load('test_q.npy').item()
    #     ai.q
    #     state = ActionState(30, 0.9, {})
    #     ai.q.get((state, -10))
    #     print(ai.getQAction(state))
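To run this test module directly, a standard unittest entry point (not part of the original snippet) can be appended:

if __name__ == '__main__':
    unittest.main()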
Example #8
    states_reward = {}

    env = gym.wrappers.Monitor(env, outdir, force=True)
    plotter = liveplot.LivePlot(outdir)

    last_time_steps = np.ndarray(0)

    actions = range(env.action_space.n)

    counter = 0
    estimate_step_per_lap = environment["estimated_steps"]
    lap_completed = False
    total_episodes = 20000
    epsilon_discount = 0.9986  # Default 0.9986

    qlearn = QLearn(actions=actions, alpha=0.8, gamma=0.9, epsilon=0.99)

    if settings.load_model:
        # file_name = 'qlearn_camera_solved/montreal/2/1_20200928_2303_act_set_simple_epsilon_0.87_QTABLE.pkl'
        file_name = 'qlearn_camera_solved/points_1_actions_simple__simple_circuit/4/1_20200921_2024_act_set_simple_epsilon_0.83_QTABLE.pkl'
        load_model(qlearn, file_name)
        highest_reward = max(qlearn.q.values())  # best Q-value in the loaded table
    else:
        highest_reward = 0
    initial_epsilon = qlearn.epsilon

    telemetry_start_time = time.time()
    start_time = datetime.datetime.now()
    start_time_format = start_time.strftime("%Y%m%d_%H%M")

    print(settings.lets_go)
Example #9
# the following actions represent an offset over the existing state,
#  modifications over the traditional "step" method of the environment
#  are implemented in the "step" function
actions = [(difference_bins, 0.0, 0.0), (-difference_bins, 0.0, 0.0),
           (0.0, difference_bins, 0.0), (0.0, -difference_bins, 0.0),
           (0.0, 0.0, difference_bins), (0.0, 0.0, -difference_bins),
           (0.0, 0.0, 0.0)]
############

# The Q-learn algorithm
#qlearn = QLearn(actions=actions,
#    alpha=0.2, gamma=0.90, epsilon=0.5, epsilon_decay_rate=0.99)

qlearn = QLearn(actions=actions,
                alpha=0.2,
                gamma=0.90,
                epsilon=0.1,
                epsilon_decay_rate=0.98)

for i_episode in range(30):  # episodes

    print("I_EPISODE", i_episode)  #####
    observation = env.reset()

    joint1_position, joint2_position, joint3_position = observation[:3]
    state = build_state([
        to_bin(joint1_position, joint1_bins),
        to_bin(joint2_position, joint2_bins),
        to_bin(joint3_position, joint3_bins)
    ])
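As the comments at the top of this example explain, each action is an offset added to the current joint state rather than an absolute command, with the modified step logic implemented in the environment. A minimal sketch of that idea (apply_offset is a hypothetical helper, not part of the original code):

def apply_offset(joint_positions, action):
    # action is a (d1, d2, d3) tuple of joint offsets; add it element-wise
    return [p + d for p, d in zip(joint_positions, action)]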
Example #10
    else:
        y = [np.mean(np.array(x)) for x in ys]
        y2 = [np.mean(np.array(x)) for x in ys2]

    plt.plot(x, y, 'r-')
    if enable_after_exec_return:
        plt.plot(x, y2, 'g-')
    plt.grid(linestyle='-', linewidth=2)
    plt.show()


#logging.basicConfig(level=logging.DEBUG)

side = OrderSide.BUY
levels = [5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -10, -12, -15]
ai = QLearn(actions=levels, epsilon=0.4, alpha=0.3, gamma=0.8)

#trainBook = 'query_result_train_15m.tsv'
#testBook = 'query_result_train_15m.tsv'

# orderbook = Orderbook(extraFeatures=False)
# orderbook.loadFromBitfinexFile('orderbook_bitfinex_btcusd_view.tsv')
# orderbook_test = Orderbook(extraFeatures=False)
# orderbook_test.loadFromBitfinexFile('orderbook_bitfinex_btcusd_view.tsv')

# Load orderbook
cols = ["ts", "seq", "size", "price", "is_bid", "is_trade", "ttype"]
import pandas as pd
events = pd.read_table('ob-1-small.tsv', sep='\t', names=cols, index_col="seq")
d = Orderbook.generateDictFromEvents(events)
orderbook = Orderbook()
Example #11
    parser.add_argument('--algo',
                        choices=['qlearn', 'sarsa', 'esarsa'],
                        default='qlearn')


parser = ArgumentParser()
algo_args(parser)
args = parser.parse_args()

for eps in [0.05, 0.2]:
    fname = '{}-{}.csv'.format(args.algo, eps)
    fpath = os.path.join("exps", fname)
    with open(fpath, "w+") as fp:
        total_rewards = 0
        options = {
            "qlearn": lambda: QLearn(eps=eps),
            "sarsa": lambda: SARSA(eps=eps),
            "esarsa": lambda: ExpectedSARSA(eps=eps)
        }
        algo = options.get(args.algo)()

        for episode in range(10000):
            grid = GridWorld()
            agent = Agent()
            s = agent.position()
            actions = [(0, 1), (1, 0), (-1, 0), (0, -1)]
            a = random.choice(actions)
            episode_reward = 0

            def step(s, a):
                s_ = grid.move(s, a)
Example #12
class AgentQlearn:
    def __init__(self, env):
        self.env = env
        self.levels = levels
        self.ai = QLearn(self.levels)

    def update(self, t, i, force_execution=False):
        aiState = ActionState(t, i)
        a = self.ai.chooseAction(aiState)
        # print('Random action: ' + str(level) + ' for state: ' + str(aiState))
        action = self.env.createAction(level=a,
                                       state=aiState,
                                       force_execution=force_execution)
        action.run(self.env.orderbook)
        i_next = self.env.determineNextInventory(action)
        t_next = self.env.determineNextTime(t)
        reward = action.getReward()
        state_next = ActionState(action.getState().getT(),
                                 action.getState().getI(),
                                 action.getState().getMarket())
        state_next.setT(t_next)
        state_next.setI(i_next)
        #print("Reward " + str(reward) + ": " + str(action.getState()) + " with " + str(action.getA()) + " -> " + str(state_next))
        self.ai.learn(state1=action.getState(),
                      action1=action.getA(),
                      reward=reward,
                      state2=state_next)
        return (t_next, i_next)

    def train(self, episodes=1, force_execution=False):
        for episode in range(int(episodes)):
            for t in self.env.T:
                logging.info("\n" + "t==" + str(t))
                for i in self.env.I:
                    logging.info("     i==" + str(i))
                    logging.info("Action run " + str((t, i)))
                    (t_next, i_next) = self.update(t, i, force_execution)
                    while i_next != 0:
                        if force_execution:
                            raise Exception("Enforced execution left " +
                                            str(i_next) + " unexecuted.")
                        logging.info("Action transition " + str((t, i)) +
                                     " -> " + str((t_next, i_next)))
                        (t_next, i_next) = self.update(t_next, i_next,
                                                       force_execution)

    def backtest(self, q=None, episodes=10, average=False, fixed_a=None):
        if q is None:
            q = self.ai.q
        else:
            self.ai.q = q

        if not q:
            raise Exception('Q-Table is empty, please train first.')

        Ms = []
        #T = self.T[1:len(self.T)]
        for t in [self.env.T[-1]]:
            logging.info("\n" + "t==" + str(t))
            for i in [self.env.I[-1]]:
                logging.info("     i==" + str(i))
                actions = []
                state = ActionState(t, i, {})
                #print(state)
                if fixed_a is not None:
                    a = fixed_a
                else:
                    try:
                        a = self.ai.getQAction(state, 0)
                        print("t: " + str(t))
                        print("i: " + str(i))
                        print("Action: " + str(a))
                        # print("Q action for state " + str(state) + ": " + str(a))
                    except:
                        # State might not be in Q-Table yet; more training required.
                        logging.info("State " + str(state) +
                                     " not in Q-Table.")
                        break
                actions.append(a)
                action = self.env.createAction(level=a,
                                               state=state,
                                               force_execution=False)
                midPrice = action.getReferencePrice()

                #print("before...")
                #print(action)
                action.run(self.env.orderbook)
                #print("after...")
                #print(action)
                i_next = self.env.determineNextInventory(action)
                t_next = self.env.determineNextTime(t)
                # print("i_next: " + str(i_next))
                while i_next != 0:
                    state_next = ActionState(t_next, i_next, {})
                    if fixed_a is not None:
                        a_next = fixed_a
                    else:
                        try:
                            a_next = self.ai.getQAction(state_next, 0)
                            print("t: " + str(t_next))
                            print("i: " + str(i_next))
                            print("Action: " + str(a_next))
                            # print("Q action for next state " + str(state_next) + ": " + str(a_next))
                        except:
                            # State might not be in Q-Table yet; more training required.
                            # print("State " + str(state_next) + " not in Q-Table.")
                            break
                    actions.append(a_next)
                    #print("Action transition " + str((t, i)) + " -> " + str(aiState_next) + " with " + str(runtime_next) + "s runtime.")

                    runtime_next = self.env.determineRuntime(t_next)
                    action.setState(state_next)
                    action.update(a_next, runtime_next)
                    action.run(self.env.orderbook)
                    #print(action)
                    i_next = self.env.determineNextInventory(action)
                    t_next = self.env.determineNextTime(t_next)

                price = action.getAvgPrice()
                # TODO: last column is for the BUY scenario only
                if action.getOrder().getSide() == OrderSide.BUY:
                    profit = midPrice - price
                else:
                    profit = price - midPrice
                Ms.append([state, midPrice, actions, price, profit])
        if not average:
            return Ms
        return self.averageBacktest(Ms)

    def averageBacktest(self, M):
        # Average states within M
        N = []
        observed = []
        for x in M:
            state = x[0]
            if state in observed:
                continue
            observed.append(state)
            paid = []
            reward = []
            for y in M:
                if y[0] == state:
                    paid.append(y[3])
                    reward.append(y[4])
            N.append([state, x[1], x[2], np.average(paid), np.average(reward)])
        return N

    def run(self, epochs_train=1, epochs_test=10):
        if epochs_train > 0:
            self.train(episodes=epochs_train)
        M = self.backtest(episodes=epochs_test, average=False)
        M = np.array(M)
        return np.mean(M[0:, 4])

    def simulate(self, epochs_train=1, epochs_test=10, interval=100):
        from agent_utils.ui import UI
        UI.animate(lambda: self.run(epochs_train, epochs_test),
                   interval=interval)
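A usage sketch for AgentQlearn, assuming an execution environment env and a module-level levels list (such as the one in Example #10) are already in scope:

agent = AgentQlearn(env)
agent.train(episodes=5)                              # fills agent.ai.q
results = agent.backtest(episodes=10, average=True)  # rows: [state, midPrice, actions, avg price, avg profit]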
Example #13
 def __init__(self, env):
     self.env = env
     self.levels = levels
     self.ai = QLearn(self.levels)
Example #14
                         bins=n_bins,
                         retbins=True)[1][1:-1]
joint3_bins = pandas.cut([-numpy.pi / 2, numpy.pi / 2],
                         bins=n_bins,
                         retbins=True)[1][1:-1]

# print("joint1_bins: ", joint1_bins)

# Generate possible actions
# TODO program this
# actions = [item for innerlist in outerlist ]
actions = [(0.0, 0.0, 0.0), (numpy.pi / 2, numpy.pi / 2, numpy.pi / 2),
           (0, 0, numpy.pi / 2)]

# The Q-learn algorithm
qlearn = QLearn(actions=actions, alpha=0.5, gamma=0.90, epsilon=0.1)

for i_episode in range(30):  # episodes
    observation = env.reset()

    joint1_position, joint2_position, joint3_position = observation[:3]
    state = build_state([
        to_bin(joint1_position, joint1_bins),
        to_bin(joint2_position, joint2_bins),
        to_bin(joint3_position, joint3_bins)
    ])

    for t in range(max_number_of_steps):
        env.render()

        # Pick an action based on the current state
Example #15
from maze_generator import MazeEnv, read_maze
from value_iteration import value_iteration
from policy_iteration import policy_improvement
import numpy as np
from mdp_graph import graph_value_policy
import matplotlib.pyplot as plt
from time import time
from qlearn import QLearn
import time

maze_shape = (32, 32)
maze_file = 'maze/mazeLarge.png'
# p = 0.1
qlearn = QLearn(num_states=32 * 32,
                num_actions=4,
                alpha=0.2,
                gamma=0.99,
                epsilon=0.1,
                softmax=True)

env = MazeEnv(maze_file=maze_file)

total_reward_hist = []
cum_total_reward = 0
q_start_hist = []
episodes = 10000
for e in range(episodes):
    done = False
    obs = env.reset()
    if (e % 100 == 0):
        print("Episode {}".format(e))
Example #16
###########################################
# Debug Q Values from QLearningMouse
#
# Curtis Long 20190221
###########################################

f = open("resources/world.txt", 'r')
lines = f.readlines()
f.close()

height = len(lines)
width = max([len(x) for x in lines])

ai = QLearn(actions=range(cfg.directions),
            alpha=cfg.alpha,
            gamma=cfg.gamma,
            epsilon=cfg.epsilon)
if (os.path.isfile('mouse.pickle')):
    with open('mouse.pickle', 'rb') as p:
        ai.q = pickle.load(p)

pprint(ai.q)
print('Items: ' + str(len(ai.q)))
#exit()

dirs = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]
actions = range(cfg.directions)
i = 0
j = 0
for line in lines:
    print("\n", end='')
Example #17
    notrl_tot_steps = 0
    notrl_returns = []
    notrl_steps = []

    # create grid-world instance
    grid = GridWorld(9)
    grid.make_maps()

    possible_actions = grid.possible_actions
    world = grid.world
    grid.list_of_maps.reverse()

    # Direct learning on final grid
    print("Direct learning on final grid")
    qlearn = QLearn(grid.final_grid, possible_actions, world)
    Q, returns, episodes, steps = do_task(
        qlearn, grid, len(grid.list_of_maps) - 1)
    notrl_returns.append(returns)
    notrl_steps.append(steps)
    notrl_tot_steps += steps[-1]
    print("-" * 80)

    # Incremental transfer learning
    print("Incremental transfer learning")
    Q = None
    for task, current_map in enumerate(grid.list_of_maps):
        print("-" * 50)
        # creates QLearn instance
        exploit = False if task == 0 else True
        qlearn = QLearn(current_map, possible_actions, world, Q)
Example #18
from maze_generator import WindyMazeEnv, read_maze
from value_iteration import value_iteration
from policy_iteration import policy_improvement
import numpy as np
from mdp_graph import graph_value_policy
import matplotlib.pyplot as plt
from time import time
from qlearn import QLearn
import time

maze_shape = (16, 16)
maze_file = 'maze/maze.png'
p = 0.1
qlearn = QLearn(num_states=16 * 16, num_actions=4, alpha=0.1, gamma=0.9, epsilon=0.1)

env = WindyMazeEnv(maze_file=maze_file, wind_prob=p)

total_reward_hist = []
cum_total_reward = 0
q_start_hist = []
episodes = 35000
for e in range(episodes):
    done = False
    obs = env.reset()
    if (e % 1000 == 0):
        print("Episode {}".format(e))

    gamma = 0.9
    gamma_pow = 1
    total_reward = 0
Example #19
def main(iteration):
    world = 4

    # saving directories
    window = 5  # moving mean window
    main_dir = 'qlearn_plots'
    sub_dir = ['4by4can', '4by4nocan', '9by9']
    sub_sub_dir = ['steps', 'episodes']

    for sub_d in sub_dir:
        for ss_d in sub_sub_dir:
            dir_name = '/'.join([main_dir, sub_d, 'win' + str(window), ss_d])
            if not os.path.exists(dir_name):
                os.makedirs(dir_name)

    # print("-" * 100)

    # Evaluation
    tot_steps = 0
    all_returns = []
    all_steps = []
    all_episodes = []

    notrl_tot_steps = 0
    notrl_returns = []
    notrl_steps = []
    notrl_episodes = []

    # create grid-world instance
    if world == 4:
        canyon = False
        grid = GridWorld(world, canyon)
        if canyon:
            canyon_str = "(CANYON)"
        else:
            canyon_str = "(NO CANYON)"
    elif world == 9:
        canyon_str = ''
        grid = GridWorld(9)
    grid.make_maps()

    possible_actions = grid.possible_actions
    grid.list_of_maps.reverse()

    # Direct learning on final grid
    # print("Direct learning on final grid")
    qlearn = QLearn(grid.final_grid, possible_actions, world)
    Q, returns, episodes, steps = do_task(qlearn, grid,
                                          len(grid.list_of_maps) - 1)
    notrl_returns.append(returns)
    notrl_steps.append(steps)
    notrl_episodes.append(episodes)
    notrl_tot_steps += steps[-1]
    # print("-" * 80)

    # Incremental transfer learning
    # print("Incremental transfer learning", canyon_str)
    Q = None
    for task, current_map in enumerate(grid.list_of_maps, 0):
        # print("-" * 50)
        # creates qlearn instance
        exploit = False if task == 0 else False
        qlearn = QLearn(current_map, possible_actions, world, Q)
        Q, returns, episodes, steps = do_task(qlearn, grid, task, exploit)
        all_returns.append(returns)
        tot_counter = 0
        epi_counter = 0
        if task != 0:
            tot_counter += all_steps[task - 1][-1]
            epi_counter += all_episodes[task - 1][-1]
            all_steps.append([i + tot_counter for i in steps])
            all_episodes.append([i + epi_counter for i in episodes])
        else:
            all_steps.append([i for i in steps])
            all_episodes.append([i for i in episodes])
    # print("-" * 100)

    # print("Incremental Transfer Cumulative total of steps",
    # all_steps[-1][-1] - all_steps[0][-1])
    # print("Direct Cumulative total of steps", notrl_steps[-1][-1])

    flat_episodes = [item for sublist in all_episodes for item in sublist]
    flat_returns = [item for sublist in all_returns for item in sublist]
    flat_steps = [item for sublist in all_steps for item in sublist]
    tmp_array = np.array(flat_returns)
    notrl_avg_returns = []
    avg_returns = []
    for t in range(len(flat_returns)):
        avg_returns.append(tmp_array[max(0, t - window):(t + 1)].mean())
    notrl_flat_returns = [
        item for sublist in notrl_returns for item in sublist
    ]
    tmp_array_1 = np.array(notrl_flat_returns)
    for t in range(len(notrl_flat_returns)):
        notrl_avg_returns.append(tmp_array_1[max(0, t - window):(t +
                                                                 1)].mean())

    fig = plt.figure()
    a0 = fig.add_subplot(1, 1, 1)
    val = 0
    for j, i in enumerate(all_steps):
        if j == len(all_steps) - 1:
            a0.axvline(x=i[-1],
                       linestyle='--',
                       color='#ccc5c6',
                       label='Task Switch')
        else:
            a0.axvline(x=i[-1], linestyle='--', color='#ccc5c6')
    a0.plot(flat_steps,
            avg_returns,
            label="Task Interpolation",
            color='#d73236',
            linewidth=1,
            linestyle='-')
    x_steps = [
        i + all_steps[0][-1] - notrl_steps[0][0] for i in notrl_steps[0]
    ]
    a0.plot(x_steps,
            notrl_avg_returns,
            label="Tabula Rasa",
            color='#80bbe5',
            linestyle='-',
            linewidth=1)
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    plt.xlabel("Steps")
    plt.ylabel("Accumulated Reward")
    plt.legend(loc="lower right")
    plt.axis([None, None, -20, 1])
    if world == 4:
        if canyon:
            step_save = 'qlearn_plots/4by4can/' + 'win' + str(
                window) + '/steps/4by4_canyon_steps'
            plt_title = '4x4 Maze Canyon'
        else:
            step_save = 'qlearn_plots/4by4nocan/' + 'win' + str(
                window) + '/steps/4by4_nocanyon_steps'
            plt_title = '4x4 Maze Non-Canyon'
    elif world == 9:
        step_save = 'qlearn_plots/9by9/' + 'win' + str(
            window) + '/steps/9by9_steps'
        plt_title = '9x9 Maze'
    plt.title(plt_title)
    plt.savefig(step_save + iteration + '.eps', format='eps', dpi=1000)
    # fig.show()

    fig1 = plt.figure()
    a1 = fig1.add_subplot(1, 1, 1)
    val = 0
    for j, i in enumerate(all_episodes):
        if j == len(all_episodes) - 1:
            a1.axvline(x=i[-1],
                       linestyle='--',
                       color='#ccc5c6',
                       label='Task Switch')
        else:
            a1.axvline(x=i[-1], linestyle='--', color='#ccc5c6')
    a1.plot(flat_episodes,
            avg_returns,
            label="Task Interpolation",
            color='#d73236',
            linewidth=1,
            linestyle='-')
    x_episodes = [
        i + all_episodes[0][-1] - notrl_episodes[0][0]
        for i in notrl_episodes[0]
    ]
    a1.plot(x_episodes,
            notrl_avg_returns,
            label="Tabula Rasa",
            color='#80bbe5',
            linestyle='-',
            linewidth=1)
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    plt.xlabel("Episodes")
    plt.ylabel("Accumulated Reward")
    plt.legend(loc="lower right")
    plt.axis([None, None, -20, 1])
    plt.title(plt_title)
    if world == 4:
        if canyon:
            epi_save = 'qlearn_plots/4by4can/' + 'win' + str(
                window) + '/episodes/4by4_canyon_episodes'
        else:
            epi_save = 'qlearn_plots/4by4nocan/' + 'win' + str(
                window) + '/episodes/4by4_nocanyon_episodes'
    elif world == 9:
        epi_save = 'qlearn_plots/9by9/' + 'win' + str(
            window) + '/episodes/9by9_episodes'
    plt.savefig(epi_save + iteration + '.eps', format='eps', dpi=1000)