def policy_iteration():
    env = Environment()
    env.load(2018)  # env.load_demo()
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size
    v = load_model()
    v = v if v is not None else np.zeros(s_S)  # value function
    p = np.full(s_S, IDLE_ACTION_INDEX)  # initial policy should be valid
    gamma = 1  # discount factor
    EPOCHS = 1000
    period = 5
    data = []
    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    print(f'Max epochs to run {EPOCHS}')
    theta = 0.05  # convergence check
    t1 = timeit.default_timer()
    for i in range(EPOCHS):
        t2 = timeit.default_timer()
        dt = format_timespan(t2 - t1)
        sys.stdout.write(f'\rIteration {i}/{EPOCHS}... {dt} passed')
        sys.stdout.flush()
        while True:  # policy evaluation
            delta = 0
            for s in range(s_S):
                v_ = v[s]
                v[s] = get_new_value_for_state(agent, s, v, p, gamma=gamma)
                delta = max(delta, np.abs(v_ - v[s]))
            # print(delta)
            if delta < theta:
                break
        # policy improvement
        policy_stable = True
        for s in range(s_S):
            action = p[s]
            p[s] = get_max_action_for_state(agent, s, v, gamma=gamma)
            if action != p[s]:
                policy_stable = False
        if i % period == 0:
            save_model(v)
        if policy_stable:
            print(f'\nFound stable policy on iteration {i}!')
            print('\nSaving resulting model to a file {}'.format(MODEL_FILENAME))
            save_model(v)
            break
    print_value_function(v)
    return p
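
# The two helpers below are not defined in this file; these are minimal
# sketches of the behaviour policy_iteration() appears to assume. The
# environment is deterministic, so each Bellman backup needs no expectation
# over next states. `agent.peek_action(a, s) -> (reward, next_state)` is a
# hypothetical non-mutating variant of take_action(), named here only for
# illustration; the project's real implementation may differ.


def get_new_value_for_state(agent, s, v, p, gamma=1):
    """Bellman expectation backup for s under the deterministic policy p."""
    r, s_ = agent.peek_action(p[s], s)  # hypothetical, see note above
    return r if s_ is None else r + gamma * v[s_]


def get_max_action_for_state(agent, s, v, gamma=1):
    """Greedy one-step lookahead: pick the action with the best backup."""
    best_a, best_value = 0, -np.inf
    for a in range(agent.actions_space_size):
        r, s_ = agent.peek_action(a, s)  # hypothetical, see note above
        value = r if s_ is None else r + gamma * v[s_]
        if value > best_value:
            best_a, best_value = a, value
    return best_a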

def main():
    v = load_model()
    env = Environment()
    env.load(2018)  # env.load_demo()
    agent = PolicyBasedTrader(policy=None, env=env, verbose=False)
    print(f'Total states: {agent.states_space_size}')
    # policy = extract_policy(agent, v)
    with open(POLICY_FILENAME, 'rb') as f:
        policy = np.load(f)
    print('Count actions')
    c = Counter()
    agent = PolicyBasedTrader(policy=None, env=env, verbose=True)
    min_amount_uah = 0
    for step in range(env.size):
        state = agent.to_state(step, agent.amount_usd)
        action = policy[state]
        c[action] += 1
        agent.take_action(action, state)
        min_amount_uah = min(agent.amount_uah, min_amount_uah)
    for i, action in enumerate(ACTIONS):
        print(action, '->', c.get(i))
    print('min amount uah', min_amount_uah)
    print(c)

def value_iteration(plot_chart=False):
    env = Environment()
    env.load(2018)  # env.load_demo()
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size
    v = load_model()
    v = v if v is not None else np.zeros(s_S)
    gamma = 1  # undiscounted return for the whole episode
    EPOCHS = 20
    period = 5
    data = []
    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    print(f'Max epochs to run {EPOCHS}')
    theta = 0.05  # convergence check
    t1 = timeit.default_timer()
    for i in range(EPOCHS):
        delta = 0
        t2 = timeit.default_timer()
        dt = format_timespan(t2 - t1)
        sys.stdout.write(f'\rIteration {i}/{EPOCHS}... {dt} passed')
        sys.stdout.flush()
        for s in range(s_S):
            v_ = v[s]
            actions_outcomes = get_outcomes_for_state(agent, s, v, gamma=gamma)
            v[s] = max(actions_outcomes)
            delta = max(delta, np.abs(v_ - v[s]))
        if i % period == 0:
            save_model(v)
        if delta < theta:
            print(f'\nValue function converged in {i} iterations')
            print('\nSaving resulting model to a file {}'.format(MODEL_FILENAME))
            save_model(v)
            break
    print_value_function(v)
    print('=' * 80)
    print('Extracting deterministic policy, pi')
    policy = extract_policy(agent, v)
    print(policy)
    return policy
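
# Sketches of the one-step lookahead helpers used by value_iteration(),
# with the same caveat as above: `agent.peek_action` is a hypothetical
# non-mutating probe. Note that elsewhere in this file extract_policy()
# is also called with a Q-table, where the greedy action is simply
# np.argmax(Q[s]); this sketch covers only the state-value case.


def get_outcomes_for_state(agent, s, v, gamma=1):
    """Return the one-step lookahead value of every action from state s."""
    outcomes = []
    for a in range(agent.actions_space_size):
        r, s_ = agent.peek_action(a, s)
        outcomes.append(r if s_ is None else r + gamma * v[s_])
    return outcomes


def extract_policy(agent, v, gamma=1):
    """Greedy deterministic policy with respect to the value function v."""
    policy = np.zeros(agent.states_space_size, dtype=int)
    for s in range(agent.states_space_size):
        outcomes = get_outcomes_for_state(agent, s, v, gamma=gamma)
        policy[s] = int(np.argmax(outcomes))
    return policy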

def evaluate_policy(policy):
    env = Environment()
    env.load(2018)  # env.load_demo()
    agent = PolicyBasedTrader(policy=None, env=env, verbose=True)
    for step in range(env.size):
        state = agent.to_state(step, agent.amount_usd)
        action = policy[state]
        agent.take_action(action, state)
    print('End amount UAH: {:.2f}'.format(agent.amount_uah))
    print('End amount USD: {:.2f}'.format(agent.amount_usd))
    print('Profit in UAH: {:.2f}'.format(agent.profit))
    exit_uah = agent.amount_usd * env.get_observation(env.size - 1).rate_buy
    exit_amount = agent.amount_uah + exit_uah
    print('Amount on exit now: {:.2f}'.format(exit_amount))
    return agent.profit

def evaluate_agent():
    env = Environment()
    env.load(2018)
    agent = PolicyBasedTrader(policy=None, env=env)
    model = load_model()
    if model is None:
        raise RuntimeError('Train agent first, no model to load')
    policy = extract_policy(agent, model)
    for step in range(env.size):
        state = agent.to_state(step, agent.amount_usd)
        action = policy[state]
        agent.take_action(action, state)
    print('End amount UAH: {:.2f}'.format(agent.amount_uah))
    print('End amount USD: {:.2f}'.format(agent.amount_usd))
    print('Profit in UAH: {:.2f}'.format(agent.profit))
    exit_uah = agent.amount_usd * env.get_observation(env.size - 1).rate_buy
    exit_amount = agent.amount_uah + exit_uah
    print('Amount on exit now: {:.2f}'.format(exit_amount))
    return agent.profit
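
# evaluate_q() is referenced by the learners below but not defined here.
# A plausible sketch: replay the greedy policy implied by the Q-table
# through one full episode and use the resulting profit as the fitness
# score. The project's real metric may differ.


def evaluate_q(env, Q):
    """Fitness of a Q-table: profit of one greedy episode."""
    agent = PolicyBasedTrader(policy=None, env=env)
    s = 0
    while s is not None:
        a = int(np.argmax(Q[s]))  # always exploit during evaluation
        _, s = agent.take_action(a, s)
    return agent.profit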

def sarsa(play=False, plot_chart=False):
    env = Environment()
    # load smaller environment for just one month
    env.load(2018)
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size
    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    print(f'Steps in environment is {env.size}')
    alpha = 1  # learning rate, discard old results immediately
    gamma = 1  # discount factor
    # load model from a file if saved previously
    model = SarsaModel.load() if play else None
    Q = model.Q if model is not None else np.zeros(shape=(s_S, s_A))
    if model is not None:
        print(f'Resuming with eps={model.eps}')
    min_eps = 0.01
    # eps = 0.1  # start with exploration
    eps = model.eps if model is not None else 0.1
    max_eps = 1.0
    decay_rate = 0.01
    EPOCHS = 100
    period = 5
    data = []
    print(f'Running {EPOCHS} epochs\n')
    lock = Lock()

    def run_iterations(worker_num=0):
        # print(f'#{worker_num}: Running {EPOCHS} iterations in worker')
        nonlocal eps
        progress = tqdm.tqdm(
            desc='#{:02d}'.format(worker_num),
            position=worker_num,
            total=EPOCHS,
            leave=False,
        )
        for i in range(EPOCHS):
            Q_copy = Q.copy()  # do not lock, just evaluate on a recent copy
            if i % period == 0:
                # print(f'#{worker_num}: Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q_copy)
                data.append(fitness)
            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            a = get_next_action(agent, Q_copy, s, eps=eps)
            # print(f'#{worker_num}: Rollout for epoch {i}')
            while s is not None:  # rollout
                r, s_ = agent.take_action(a, s)
                with lock:
                    if s_ is not None:
                        a_ = get_next_action(agent, Q, s_, eps=eps)
                        q_update = alpha * (r + gamma * Q[s_, a_] - Q[s, a])
                    else:
                        q_update = alpha * (r - Q[s, a])
                        a_ = None
                    Q[s, a] += q_update
                s = s_
                a = a_
            eps = min_eps + (max_eps - min_eps) * np.exp(-decay_rate * i)
            progress.update()
        progress.close()
        return worker_num

    fig, ax = plt.subplots(figsize=(6, 4))
    fig.canvas.set_window_title('Agent evaluation')

    def build_live_chart(i):
        local_data = data[:]
        datax = np.arange(0, period * len(local_data), period)
        plt.xlabel('Iterations')
        plt.ylabel('Fitness')
        plt.title('Learning curve')
        ax.clear()
        ax.plot(datax, local_data, 'b', label='Score')
        ax.legend()
        ax.grid(True)

    workers = 1
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(run_iterations, i) for i in range(workers)
        ]
        # keep a reference so the animation is not garbage collected
        ani = animation.FuncAnimation(fig, build_live_chart, interval=500)
        plt.show()
        result = concurrent.futures.wait(futures)
        plt.close()
        assert len(result.done) == workers
    # Save latest data
    if not play:
        model = SarsaModel(Q=Q, eps=eps)
        model.save()
    print('\nDone!')
    policy = extract_policy(agent, Q)
    if plot_chart:
        build_evaluation_chart(data, period=period)
    return policy
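
# get_next_action() is assumed to implement epsilon-greedy selection over
# the Q-table; this is a sketch of that contract, not the project's code.


def get_next_action(agent, Q, s, eps=0.1):
    """With probability eps explore uniformly, otherwise act greedily."""
    if np.random.random() < eps:
        return np.random.randint(agent.actions_space_size)
    return int(np.argmax(Q[s]))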

def q_learning(plot_chart=False):
    env = Environment()
    env.load(2018)
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size
    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    alpha = 0.2  # learning rate
    gamma = 1  # discount factor
    eps = 0.8  # exploration factor, higher - more exploration
    model = load_model()
    Q = model if model is not None else np.zeros(shape=(s_S, s_A))
    EPOCHS = 500
    period = 5
    data = []
    lock = Lock()

    def run_iterations(worker_num=0):
        print(f'#{worker_num}: Running {EPOCHS} iterations in worker')
        for i in range(EPOCHS):
            Q_copy = Q.copy()  # do not lock, just evaluate on a recent copy
            if i % period == 0:
                print(f'#{worker_num}: Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q_copy)
                print(f'#{worker_num}: Current fitness: {fitness:.2f}')
                data.append(fitness)
            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            print(f'#{worker_num}: Rollout for epoch {i}')
            while s is not None:  # rollout
                # do not allow other threads to update Q within a single step
                with lock:
                    a = get_next_action(agent, Q, s, eps)
                    r, s_ = agent.take_action(a, s)
                    # maximize Q for the next state, then move Q[s, a]
                    # toward the TD target r + gamma * max_q
                    max_q = maximize_q(agent, Q, s_)
                    Q[s, a] += alpha * (r + gamma * max_q - Q[s, a])
                s = s_

    workers = 8
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {
            executor.submit(run_iterations, i): i for i in range(workers)
        }
        for future in concurrent.futures.as_completed(futures):
            worker_num = futures[future]
            try:
                future.result()  # re-raise any worker exception here
                print(f'#{worker_num}: Finished!')
            except Exception as e:
                print(f'#{worker_num}: Failed with {e}')
    save_model(Q)
    policy = extract_policy(agent, Q)
    if plot_chart:
        build_evaluation_chart(data, period=period)
    return policy
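
# maximize_q() backs up the best action value of the next state; a sketch,
# assuming the terminal state is signalled by s being None (as in the
# rollout loops above).


def maximize_q(agent, Q, s):
    """Return max_a Q[s, a], or 0 when the episode has ended."""
    return 0.0 if s is None else float(np.max(Q[s]))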

# Alternative SARSA trainer: a single background thread with a live
# learning-curve window; if kept in the same module, this definition
# shadows the ThreadPoolExecutor version of sarsa() above.
def sarsa(play=False, plot_chart=False):
    env = Environment()
    # load smaller environment for just one month
    env.load(2018)
    agent = PolicyBasedTrader(policy=None, env=env)
    s_S = agent.states_space_size
    s_A = agent.actions_space_size
    print(f'States space size is {s_S}')
    print(f'Actions space size is {s_A}')
    print(f'Steps in environment is {env.size}')
    alpha = 1  # learning rate, discard old results immediately
    gamma = 1  # discount factor
    # load model from a file if saved previously
    model = SarsaModel.load() if play else None
    Q = model.Q if model is not None else np.zeros(shape=(s_S, s_A))
    if model is not None:
        print(f'Resuming with eps={model.eps}')
    min_eps = 0.01
    # start with exploration unless resuming a saved run
    eps = model.eps if model is not None else 0.1
    max_eps = 1.0
    decay_rate = 0.05
    best_fitness = -np.inf
    EPOCHS = 2000
    period = 5
    data = []
    fig, ax = plt.subplots(figsize=(6, 4))
    fig.canvas.set_window_title('Agent evaluation')

    def build_live_chart(i):
        window = 20  # show N last values
        local_data = data[-window:]
        sv = (len(data) - window) * period if len(data) - window > 0 else 0
        ev = len(data) * period
        datax = np.arange(sv, ev, period)
        plt.xlabel('Iterations')
        plt.ylabel('Fitness')
        plt.title('Learning curve')
        ax.clear()
        ax.plot(datax, local_data, 'b', label='Score')
        ax.legend()
        ax.grid(True)

    def run_iterations():
        nonlocal eps, best_fitness
        print(f'Running {EPOCHS} epochs\n')
        for i in range(EPOCHS):
            if i % period == 0:
                print(f'Evaluating agent on {i} iteration...')
                fitness = evaluate_q(env, Q)
                if fitness > 0:
                    click.secho(f'We have positive fitness {fitness:.2f}',
                                fg='red')
                if fitness > best_fitness:
                    best_fitness = fitness
                data.append(fitness)
            # reset env for each epoch
            agent = PolicyBasedTrader(policy=None, env=env)
            s = 0  # starting state
            a = get_next_action(agent, Q, s, eps=eps)
            print(f'Rollout for epoch {i}')
            while s is not None:  # rollout
                r, s_ = agent.take_action(a, s)
                if s_ is not None:
                    a_ = get_next_action(agent, Q, s_, eps=eps)
                    q_update = alpha * (r + gamma * Q[s_, a_] - Q[s, a])
                else:
                    q_update = alpha * (r - Q[s, a])
                    a_ = None
                Q[s, a] += q_update
                s = s_
                a = a_
            eps = min_eps + (max_eps - min_eps) * np.exp(-decay_rate * i)

    # keep a reference so the animation is not garbage collected
    ani = animation.FuncAnimation(fig, build_live_chart, interval=500)
    t = threading.Thread(target=run_iterations)
    t.start()
    plt.show()
    t.join()
    # Save latest data
    if not play:
        model = SarsaModel(Q=Q, eps=eps)
        model.save()
    print('\nDone!')
    click.secho(f'Best fitness {best_fitness:.2f}', fg='green')
    policy = extract_policy(agent, Q)
    if plot_chart:
        build_evaluation_chart(data, period=period)
    return policy
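
# SarsaModel persists the Q-table together with the current exploration
# rate, so a resumed run continues its eps decay where it left off. This
# is a sketch of that interface; the filename and the .npz storage format
# are assumptions, not the project's actual persistence code.


class SarsaModel:
    FILENAME = 'sarsa_model.npz'  # assumed location

    def __init__(self, Q, eps):
        self.Q = Q
        self.eps = eps

    def save(self):
        np.savez(self.FILENAME, Q=self.Q, eps=self.eps)

    @classmethod
    def load(cls):
        try:
            data = np.load(cls.FILENAME)
        except FileNotFoundError:
            return None
        return cls(Q=data['Q'], eps=float(data['eps']))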