Example no. 1
    joint1_position, joint2_position, joint3_position = observation[:3]
    state = build_state([
        to_bin(joint1_position, joint1_bins),
        to_bin(joint2_position, joint2_bins),
        to_bin(joint3_position, joint3_bins)
    ])

    for t in range(max_number_of_steps):
        env.render()
        # print("join1_bins", joint1_bins)
        # print("Number of steps", t)
        # print("q: ",qlearn.q)
        # Pick an action based on the current f
        # print("state: ",state)
        # Pick an action based on the current state
        action = qlearn.chooseAction(state)
        # print("state:", state)##
        # print("action: ",action)
        # print("state observation: ",observation[:3])###

        # Execute the action and get feedback.
        # observation, reward, done, info = env.step(action)  # environment step method
        # Local step method: applies the action as an offset to the joint positions
        observation, reward, done, info = step(action, observation[:3])
        print("reward: ", reward)

        # Digitize the observation to get a state
        joint1_position, joint2_position, joint3_position = observation[:3]
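The fragment above depends on `to_bin` and `build_state` helpers (and on bin arrays such as `joint1_bins`) that are not shown. Below is a minimal sketch of how such discretization helpers are typically written, assuming numpy and a hand-picked bin layout; the exact bin edges and key format in the original may differ.

import numpy as np

def to_bin(value, bins):
    # Map a continuous joint position to the index of the bin it falls into
    return int(np.digitize(value, bins))

def build_state(bin_indices):
    # Pack the per-joint bin indices into a single hashable state key,
    # e.g. [3, 0, 7] -> "3-0-7"
    return "-".join(str(int(i)) for i in bin_indices)

# Assumed bin edges for joints whose positions lie in [-1, 1]
joint1_bins = joint2_bins = joint3_bins = np.linspace(-1.0, 1.0, num=9)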
Example no. 2
        print("Episode {}".format(e))

    gamma = 0.9
    gamma_pow = 1
    total_reward = 0

    q_start_hist.append(np.max(qlearn.q[obs]))
    while not done:
        action = qlearn.chooseAction(obs)
        new_obs, reward, done, _ = env.step(action)
        total_reward += gamma_pow * reward
        gamma_pow *= gamma

        qlearn.learn(obs, action, reward, new_obs, done)
        obs = new_obs

    cum_total_reward += total_reward
    total_reward_hist.append(cum_total_reward)

q = qlearn.q
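Both fragments above call into a QLearn object that is not shown. The following is a minimal tabular sketch consistent with the np.max(qlearn.q[obs]) usage in Example no. 2; the constructor arguments, hyperparameter defaults, and array-based storage are assumptions, and the class used in Examples no. 1 and no. 3 evidently keys its table by discretized state strings or ActionState objects instead.

import numpy as np

class QLearn:
    """Minimal tabular Q-learning agent (sketch; the original class may differ)."""

    def __init__(self, n_states, n_actions, epsilon=0.1, alpha=0.2, gamma=0.9):
        # One row of action-values per state, so q[obs] is an array of values
        self.q = np.zeros((n_states, n_actions))
        self.epsilon = epsilon  # exploration rate
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor

    def chooseAction(self, state):
        # Epsilon-greedy action selection
        if np.random.rand() < self.epsilon:
            return int(np.random.randint(self.q.shape[1]))
        return int(np.argmax(self.q[state]))

    def learn(self, state1, action1, reward, state2, done=False):
        # One-step Q-learning update:
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        target = reward if done else reward + self.gamma * np.max(self.q[state2])
        self.q[state1, action1] += self.alpha * (target - self.q[state1, action1])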
Example no. 3
class AgentQlearn:
    def __init__(self, env, levels):
        self.env = env
        self.levels = levels
        self.ai = QLearn(self.levels)

    def update(self, t, i, force_execution=False):
        aiState = ActionState(t, i)
        a = self.ai.chooseAction(aiState)
        # print('Random action: ' + str(level) + ' for state: ' + str(aiState))
        action = self.env.createAction(level=a,
                                       state=aiState,
                                       force_execution=force_execution)
        action.run(self.env.orderbook)
        i_next = self.env.determineNextInventory(action)
        t_next = self.env.determineNextTime(t)
        reward = action.getReward()
        state_next = ActionState(action.getState().getT(),
                                 action.getState().getI(),
                                 action.getState().getMarket())
        state_next.setT(t_next)
        state_next.setI(i_next)
        #print("Reward " + str(reward) + ": " + str(action.getState()) + " with " + str(action.getA()) + " -> " + str(state_next))
        self.ai.learn(state1=action.getState(),
                      action1=action.getA(),
                      reward=reward,
                      state2=state_next)
        return (t_next, i_next)

    def train(self, episodes=1, force_execution=False):
        for episode in range(int(episodes)):
            for t in self.env.T:
                logging.info("\n" + "t==" + str(t))
                for i in self.env.I:
                    logging.info("     i==" + str(i))
                    logging.info("Action run " + str((t, i)))
                    (t_next, i_next) = self.update(t, i, force_execution)
                    while i_next != 0:
                        if force_execution:
                            raise Exception("Enforced execution left " +
                                            str(i_next) + " unexecuted.")
                        logging.info("Action transition " + str((t, i)) +
                                     " -> " + str((t_next, i_next)))
                        (t_next, i_next) = self.update(t_next, i_next,
                                                       force_execution)

    def backtest(self, q=None, episodes=10, average=False, fixed_a=None):
        if q is None:
            q = self.ai.q
        else:
            self.ai.q = q

        if not q:
            raise Exception('Q-Table is empty, please train first.')

        Ms = []
        #T = self.T[1:len(self.T)]
        for t in [self.env.T[-1]]:
            logging.info("\n" + "t==" + str(t))
            for i in [self.env.I[-1]]:
                logging.info("     i==" + str(i))
                actions = []
                state = ActionState(t, i, {})
                #print(state)
                if fixed_a is not None:
                    a = fixed_a
                else:
                    try:
                        a = self.ai.getQAction(state, 0)
                        print("t: " + str(t))
                        print("i: " + str(i))
                        print("Action: " + str(a))
                        # print("Q action for state " + str(state) + ": " + str(a))
                    except:
                        # State might not be in Q-Table yet; more training required.
                        logging.info("State " + str(state) +
                                     " not in Q-Table.")
                        break
                actions.append(a)
                action = self.env.createAction(level=a,
                                               state=state,
                                               force_execution=False)
                midPrice = action.getReferencePrice()

                #print("before...")
                #print(action)
                action.run(self.env.orderbook)
                #print("after...")
                #print(action)
                i_next = self.env.determineNextInventory(action)
                t_next = self.env.determineNextTime(t)
                # print("i_next: " + str(i_next))
                while i_next != 0:
                    state_next = ActionState(t_next, i_next, {})
                    if fixed_a is not None:
                        a_next = fixed_a
                    else:
                        try:
                            a_next = self.ai.getQAction(state_next, 0)
                            print("t: " + str(t_next))
                            print("i: " + str(i_next))
                            print("Action: " + str(a_next))
                            # print("Q action for next state " + str(state_next) + ": " + str(a_next))
                        except:
                            # State might not be in Q-Table yet; more training required.
                            # print("State " + str(state_next) + " not in Q-Table.")
                            break
                    actions.append(a_next)
                    #print("Action transition " + str((t, i)) + " -> " + str(aiState_next) + " with " + str(runtime_next) + "s runtime.")

                    runtime_next = self.env.determineRuntime(t_next)
                    action.setState(state_next)
                    action.update(a_next, runtime_next)
                    action.run(self.env.orderbook)
                    #print(action)
                    i_next = self.env.determineNextInventory(action)
                    t_next = self.env.determineNextTime(t_next)

                price = action.getAvgPrice()
                # TODO: last column is for the BUY scenario only
                if action.getOrder().getSide() == OrderSide.BUY:
                    profit = midPrice - price
                else:
                    profit = price - midPrice
                Ms.append([state, midPrice, actions, price, profit])
        if not average:
            return Ms
        return self.averageBacktest(Ms)

    def averageBacktest(self, M):
        # Average states within M
        N = []
        observed = []
        for x in M:
            state = x[0]
            if state in observed:
                continue
            observed.append(state)
            paid = []
            reward = []
            for y in M:
                if y[0] == state:
                    paid.append(y[3])
                    reward.append(y[4])
            N.append([state, x[1], x[2], np.average(paid), np.average(reward)])
        return N

    def run(self, epochs_train=1, epochs_test=10):
        if epochs_train > 0:
            self.train(episodes=epochs_train)
        M = self.backtest(episodes=epochs_test, average=False)
        M = np.array(M)
        return np.mean(M[0:, 4])

    def simulate(self, epochs_train=1, epochs_test=10, interval=100):
        from agent_utils.ui import UI
        UI.animate(lambda: self.run(epochs_train, epochs_test),
                   interval=interval)
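For completeness, a hypothetical driver for AgentQlearn is sketched below. The env object and the level range are assumptions, not part of the source; env must expose T, I, orderbook, createAction, determineNextInventory, determineNextTime, and determineRuntime as used above.

# Hypothetical usage sketch
levels = list(range(-10, 11))              # assumed candidate order levels
agent = AgentQlearn(env, levels)
agent.train(episodes=50)                   # fill the Q-table
results = agent.backtest(episodes=10, average=True)
for state, mid_price, actions, avg_price, profit in results:
    print(state, mid_price, actions, avg_price, profit)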