# Track the maximum Q-value of the start state to monitor convergence.
q_start_hist.append(np.max(qlearn.q[obs]))

while not done:
    # obs = env.reset()
    # action = 1
    # new_obs, reward, done, _ = env.step(action)
    # print(obs, action, reward, new_obs)
    # time.sleep(4)
    # print(obs)
    # print(qlearn.q[obs])
    action = qlearn.chooseAction(obs)
    new_obs, reward, done, _ = env.step(action)
    # Accumulate the discounted return for this episode.
    total_reward += gamma_pow * reward
    gamma_pow *= gamma
    qlearn.learn(obs, action, reward, new_obs, done)
    # if action == 1:
    #     print(obs, action, reward, new_obs)
    # env.render()
    # time.sleep(0.01)
    obs = new_obs

cum_total_reward += total_reward
total_reward_hist.append(cum_total_reward)

# Derive the greedy policy and state values from the learned Q-table
# and reshape them onto the maze grid for inspection.
q = qlearn.q
policy = np.argmax(q, axis=1)
value = np.max(q, axis=1)
policy = policy.reshape(maze_shape)
value = value.reshape(maze_shape)
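# The loop above assumes a tabular Q-learner exposing chooseAction(obs) and
# learn(obs, action, reward, new_obs, done), with q stored as an (n_states,
# n_actions) array so that np.argmax(q, axis=1) yields the greedy policy.
# The class below is only a minimal sketch of that interface (epsilon-greedy
# selection, standard one-step Q-update); the QLearn used elsewhere in this
# repo may differ in details such as the exploration schedule.
import numpy as np


class TabularQLearnSketch:
    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.95, epsilon=0.1):
        self.q = np.zeros((n_states, n_actions))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def chooseAction(self, obs):
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.q.shape[1])
        return int(np.argmax(self.q[obs]))

    def learn(self, obs, action, reward, new_obs, done):
        # One-step Q-learning update; the bootstrap term is dropped on terminal steps.
        target = reward
        if not done:
            target += self.gamma * np.max(self.q[new_obs])
        self.q[obs, action] += self.alpha * (target - self.q[obs, action])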
        )  # local step method, applies the action as an offset to the state
        print("reward: ", reward)
        # print("next state observation: ", observation[:3])

        # Digitize the observation to get a state
        joint1_position, joint2_position, joint3_position = observation[:3]
        nextState = build_state([
            to_bin(joint1_position, joint1_bins),
            to_bin(joint2_position, joint2_bins),
            to_bin(joint3_position, joint3_bins)
        ])
        # print("nextState", nextState)

        if done:
            last_time_steps = numpy.append(last_time_steps, [int(t + 1)])
            break
        else:
            # Q-learn stuff
            # qlearn.learn(state, action, reward, nextState)
            qlearn.learn(state, action, reward, nextState, save_model_with_prefix, it)
            state = nextState

        it += 1

# l = last_time_steps.tolist()
# l.sort()
# print("Overall score: {:0.2f}".format(last_time_steps.mean()))
# print("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))
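# The robot-arm loops in this file discretize continuous joint positions with
# to_bin() and pack the resulting bin indices into a single integer state id
# with build_state(). The helpers below are only a sketch of that pattern
# (numpy.digitize binning, digit concatenation), following the common gym
# Q-learning tutorials this code mirrors; the repo's own definitions may use
# different bin counts or a different encoding.
import numpy as np


def to_bin(value, bins):
    # Index of the bin that `value` falls into.
    return int(np.digitize(x=[value], bins=bins)[0])


def build_state(features):
    # Concatenate bin indices into one integer state id, e.g. [3, 0, 7] -> 307.
    return int("".join(str(int(feature)) for feature in features))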
class AgentQlearn:
    def __init__(self, env):
        self.env = env
        # `levels` (the available order placement levels) is expected to be
        # defined at module scope.
        self.levels = levels
        self.ai = QLearn(self.levels)

    def update(self, t, i, force_execution=False):
        aiState = ActionState(t, i)
        a = self.ai.chooseAction(aiState)
        # print('Random action: ' + str(level) + ' for state: ' + str(aiState))
        action = self.env.createAction(level=a, state=aiState, force_execution=force_execution)
        action.run(self.env.orderbook)
        i_next = self.env.determineNextInventory(action)
        t_next = self.env.determineNextTime(t)
        reward = action.getReward()
        state_next = ActionState(action.getState().getT(), action.getState().getI(),
                                 action.getState().getMarket())
        state_next.setT(t_next)
        state_next.setI(i_next)
        # print("Reward " + str(reward) + ": " + str(action.getState()) + " with " + str(action.getA()) + " -> " + str(state_next))
        self.ai.learn(state1=action.getState(), action1=action.getA(),
                      reward=reward, state2=state_next)
        return (t_next, i_next)

    def train(self, episodes=1, force_execution=False):
        for episode in range(int(episodes)):
            for t in self.env.T:
                logging.info("\n" + "t==" + str(t))
                for i in self.env.I:
                    logging.info(" i==" + str(i))
                    logging.info("Action run " + str((t, i)))
                    (t_next, i_next) = self.update(t, i, force_execution)
                    while i_next != 0:
                        if force_execution:
                            raise Exception("Enforced execution left " + str(i_next) + " unexecuted.")
                        logging.info("Action transition " + str((t, i)) + " -> " + str((t_next, i_next)))
                        (t_next, i_next) = self.update(t_next, i_next, force_execution)

    def backtest(self, q=None, episodes=10, average=False, fixed_a=None):
        if q is None:
            q = self.ai.q
        else:
            self.ai.q = q
        if not q:
            raise Exception('Q-Table is empty, please train first.')

        Ms = []
        # T = self.T[1:len(self.T)]
        for t in [self.env.T[-1]]:
            logging.info("\n" + "t==" + str(t))
            for i in [self.env.I[-1]]:
                logging.info(" i==" + str(i))
                actions = []
                state = ActionState(t, i, {})
                # print(state)
                if fixed_a is not None:
                    a = fixed_a
                else:
                    try:
                        a = self.ai.getQAction(state, 0)
                        print("t: " + str(t))
                        print("i: " + str(i))
                        print("Action: " + str(a))
                        # print("Q action for state " + str(state) + ": " + str(a))
                    except:
                        # State might not be in Q-Table yet, more training required.
                        logging.info("State " + str(state) + " not in Q-Table.")
                        break
                actions.append(a)
                action = self.env.createAction(level=a, state=state, force_execution=False)
                midPrice = action.getReferencePrice()
                # print("before...")
                # print(action)
                action.run(self.env.orderbook)
                # print("after...")
                # print(action)
                i_next = self.env.determineNextInventory(action)
                t_next = self.env.determineNextTime(t)
                # print("i_next: " + str(i_next))
                while i_next != 0:
                    state_next = ActionState(t_next, i_next, {})
                    if fixed_a is not None:
                        a_next = fixed_a
                    else:
                        try:
                            a_next = self.ai.getQAction(state_next, 0)
                            print("t: " + str(t_next))
                            print("i: " + str(i_next))
                            print("Action: " + str(a_next))
                            # print("Q action for next state " + str(state_next) + ": " + str(a_next))
                        except:
                            # State might not be in Q-Table yet, more training required.
# print("State " + str(state_next) + " not in Q-Table.") break actions.append(a_next) #print("Action transition " + str((t, i)) + " -> " + str(aiState_next) + " with " + str(runtime_next) + "s runtime.") runtime_next = self.env.determineRuntime(t_next) action.setState(state_next) action.update(a_next, runtime_next) action.run(self.env.orderbook) #print(action) i_next = self.env.determineNextInventory(action) t_next = self.env.determineNextTime(t_next) price = action.getAvgPrice() # TODO: last column is for for the BUY scenario only if action.getOrder().getSide() == OrderSide.BUY: profit = midPrice - price else: profit = price - midPrice Ms.append([state, midPrice, actions, price, profit]) if not average: return Ms return self.averageBacktest(Ms) def averageBacktest(self, M): # Average states within M N = [] observed = [] for x in M: state = x[0] if state in observed: continue observed.append(state) paid = [] reward = [] for y in M: if y[0] == state: paid.append(y[3]) reward.append(y[4]) N.append([state, x[1], x[2], np.average(paid), np.average(reward)]) return N def run(self, epochs_train=1, epochs_test=10): if epochs_train > 0: agent.train(episodes=epochs_train) M = agent.backtest(episodes=epochs_test, average=False) M = np.array(M) return np.mean(M[0:, 4]) def simulate(self, epochs_train=1, epochs_test=10, interval=100): from agent_utils.ui import UI UI.animate(lambda: self.run(epochs_train, epochs_test), interval=interval)
    def testStateEquality(self):
        # Two ActionStates built from identical fields must map to the same
        # Q-Table entry, so the value learned for a1 is retrievable via a2.
        ai = QLearn([-1, 0, 1])
        a1 = ActionState(1.0, 1.0, {'vol60': 1})
        a2 = ActionState(1.0, 1.0, {'vol60': 1})
        ai.learn(a1, 1, 1.0, a2)
        self.assertEqual(ai.getQAction(a2), 1)
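# The test above depends on ActionState comparing and hashing by value, so two
# equal states address the same Q-Table entry. The class below is only a sketch
# of the dictionary-backed QLearn interface the test implies (learn plus a
# greedy getQAction); the repo's actual QLearn may differ, e.g. its getQAction
# also accepts an extra argument in AgentQlearn.backtest above.
class DictQLearnSketch:
    def __init__(self, actions, alpha=0.5, gamma=0.9):
        self.q = {}                # maps (state, action) -> Q-value
        self.actions = actions
        self.alpha = alpha
        self.gamma = gamma

    def getQ(self, state, action):
        return self.q.get((state, action), 0.0)

    def learn(self, state1, action1, reward, state2):
        # Standard one-step Q-learning update on dictionary entries.
        maxq_next = max(self.getQ(state2, a) for a in self.actions)
        old = self.getQ(state1, action1)
        self.q[(state1, action1)] = old + self.alpha * (reward + self.gamma * maxq_next - old)

    def getQAction(self, state):
        # Greedy action for `state`; raises KeyError if the state was never visited.
        known = [(a, self.q[(state, a)]) for a in self.actions if (state, a) in self.q]
        if not known:
            raise KeyError('State not in Q-Table: ' + str(state))
        return max(known, key=lambda pair: pair[1])[0]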
        # Pick an action based on the current state
        action = qlearn.chooseAction(state)
        print("action: ", action)

        # Execute the action and get feedback
        observation, reward, done, info = env.step(action)
        print("reward: ", reward)
        # print("observation: ", observation)
        print("q: ", qlearn.q)

        # Digitize the observation to get a state
        joint1_position, joint2_position, joint3_position = observation[:3]
        nextState = build_state([
            to_bin(joint1_position, joint1_bins),
            to_bin(joint2_position, joint2_bins),
            to_bin(joint3_position, joint3_bins)
        ])

        if done:
            last_time_steps = numpy.append(last_time_steps, [int(t + 1)])
            break
        else:
            # Q-learn stuff
            qlearn.learn(state, action, reward, nextState)
            state = nextState

l = last_time_steps.tolist()
l.sort()
print("Overall score: {:0.2f}".format(last_time_steps.mean()))
# Note: reduce() is not a builtin in Python 3; this assumes
# `from functools import reduce` at the top of the file.
print("Best 100 score: {:0.2f}".format(
    reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))