Example #1
0
    def __init__(self, config):
        """Store ``config`` and construct the helpers the agent relies on.

        Creates, in order: the epsilon and learning-rate ``LinearSchedule``
        objects, the order and message queues, the replay buffer and the
        action callback, and finally calls ``self.build()``.
        """
        cfg = config
        self._config = cfg

        # Linear schedules parameterised by begin value, end value, step count.
        self._eps_schedule = LinearSchedule(
            cfg.eps_begin, cfg.eps_end, cfg.nsteps)
        self._lr_schedule = LinearSchedule(
            cfg.lr_begin, cfg.lr_end, cfg.lr_nsteps)

        # Data sources, built from the paths given in the config.
        self._oq = Order_Queue(cfg.order_path)
        self._mq = Message_Queue(cfg.message_path)

        # Replay storage; the first argument is presumably its capacity.
        self._bf = ReplayBuffer(1000000, cfg)

        # The action callback is created before build() runs.
        self._action_fn = self.get_action_fn()

        self.build()
Example #2
0
def main():
    """Evaluate a ``Neural_DQN`` model in test mode and print its mean reward."""
    cfg = Config()
    cfg.mode = 'test'
    cfg.dropout = 1.0

    # Alternative model kept from the original: agent = DQN(cfg)
    agent = Neural_DQN(cfg)
    agent.initialize()

    order_queue = Order_Queue(cfg.order_path)
    message_queue = Message_Queue(cfg.message_path)

    episode_rewards = evaluate_policy(agent, order_queue, message_queue)
    print(np.mean(episode_rewards))
Example #3
0
class model(object):
    """Base class for an epsilon-greedy agent trading on a simulated order book.

    The class owns the exploration / learning-rate schedules, the order and
    message queues, and a replay buffer.  ``build``, ``initialize``,
    ``get_random_action`` and ``get_best_action`` are empty placeholders here
    and are expected to be overridden by concrete models.
    """

    def __init__(self, config):
        """Store ``config`` and construct schedules, queues and replay buffer.

        Args:
            config: object exposing ``eps_begin``, ``eps_end``, ``nsteps``,
                ``lr_begin``, ``lr_end``, ``lr_nsteps``, ``order_path`` and
                ``message_path`` (other attributes are read by other methods).
        """
        self._config = config
        self._eps_schedule = LinearSchedule(self._config.eps_begin,
                                            self._config.eps_end,
                                            self._config.nsteps)
        self._lr_schedule = LinearSchedule(self._config.lr_begin,
                                           self._config.lr_end,
                                           self._config.lr_nsteps)
        self._oq = Order_Queue(self._config.order_path)
        self._mq = Message_Queue(self._config.message_path)
        # First argument is presumably the buffer capacity -- TODO confirm.
        self._bf = ReplayBuffer(1000000, config)

        # Note: the action callback is created before build() is called.
        self._action_fn = self.get_action_fn()

        self.build()

    def build(self):
        """Placeholder called at the end of ``__init__``; does nothing here."""
        pass

    def initialize(self):
        """Placeholder for model initialisation; does nothing here."""
        pass

    def get_random_action(self, state):
        """Placeholder for random action selection.

        ``get_action`` takes element ``[0]`` of the result as the action, so
        an override must return an indexable whose first item is the action.
        """
        pass

    def get_best_action(self, state):
        """Placeholder for greedy action selection.

        An override should return ``(action, q_value)``; ``get_action`` uses
        element ``[0]`` as the action.
        """
        pass

    def get_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``self._eps_schedule.get_epsilon()`` the action comes
        from ``get_random_action``; otherwise from ``get_best_action``.
        """
        if np.random.random() < self._eps_schedule.get_epsilon():
            return self.get_random_action(state)[0]
        else:
            return self.get_best_action(state)[0]

    def get_random_action_fn(self):
        """Return a callback that draws a uniformly random action.

        The callback has signature ``(t, amount, state, mid_price)`` and
        returns ``(price, action)`` where ``action`` is drawn from
        ``[0, config.L)`` and ``price`` is
        ``(action - L // 2) * base_point + mid_price``.
        """
        def random_action_fn(t, amount, state, mid_price):
            # randint(L) yields 0..L-1; index L itself is reserved for the
            # market order (see simulate_an_episode).
            action = np.random.randint(self._config.L)
            price = (action -
                     self._config.L // 2) * self._config.base_point + mid_price
            return (price, action)

        return random_action_fn

    def get_action_fn(self):
        """Return a callback that selects actions via ``get_action``.

        The callback has signature ``(t, amount, state, mid_price)`` and
        returns ``(price, action)`` with the same price mapping as
        ``get_random_action_fn``.
        """
        def action_fn(t, amount, state, mid_price):
            action = self.get_action(state)
            price = (action -
                     self._config.L // 2) * self._config.base_point + mid_price
            return (price, action)

        return action_fn

    def pad_state(self, states, state_history):
        """Stack recent states along a new last axis and left-pad with zeros.

        Args:
            states: non-empty sequence of ``(state_book, state_it)`` pairs
                holding at most ``state_history`` entries.
            state_history: target length of the last axis.

        Returns:
            ``([stacked_books], [latest_it])`` where ``stacked_books`` has
            the books stacked on the last axis, zero-padded at the front to
            length ``state_history``, and ``latest_it`` is the ``state_it``
            of the last entry.
        """
        tmp_states, tmp_its = zip(*states)
        tmp_state = np.concatenate(
            [np.expand_dims(state, -1) for state in tmp_states], axis=-1)
        # The three pad pairs assume each book is 2-D (3-D after stacking).
        tmp_state = np.pad(tmp_state,
                           ((0, 0), (0, 0),
                            (state_history - tmp_state.shape[-1], 0)),
                           'constant',
                           constant_values=0)
        tmp_it = tmp_its[-1]
        return ([tmp_state], [tmp_it])

    def simulate_an_episode(self, amount, T, H, start_time, order_direction,
                            action_fn, depth):
        """Replay one episode on the order book and record the transitions.

        The horizon ``H`` is split into steps of length ``H // T``.  At each
        step before the last, ``action_fn`` chooses a price for the own
        order, which is then exposed to the messages of the following
        interval.  If the order is not fully filled by the last decision
        point, it is re-priced at ``own_trade_type * _DUMMY_VARIABLE``
        (presumably a market order) and recorded with action index
        ``config.L``.

        Args:
            amount: total amount to trade; also normalises the remaining
                amount in the state.
            T: number of steps the horizon is divided into.
            H: horizon length, in the same integer time units as
                ``start_time``.
            start_time: time at which the order book is created.
            order_direction: passed to the book as ``own_trade_type``; its
                negation scales the initial dummy price.
            action_fn: callable ``(time_remaining, amount_remaining,
                state_input, mid_price) -> (price, action)``.
            depth: forwarded to ``lob.display_book``.

        Returns:
            ``(states, rewards, actions, done_mask)`` where ``states`` has
            one more entry than ``rewards`` / ``actions`` / ``done_mask``.
        """
        dH = H // T
        self._mq.reset()
        lob_data = self._oq.create_orderbook_time(start_time, self._mq)
        lob = Limit_Order_book(**lob_data,
                               own_amount_to_trade=0,
                               own_init_price=-order_direction *
                               Limit_Order_book._DUMMY_VARIABLE,
                               own_trade_type=order_direction)
        rewards = []
        states = []
        actions = []
        done_mask = []

        amount_remain = amount
        cum_reward = 0
        # Time of the decision point that follows the last completed step.
        # Tracked explicitly so the final branch also works when the loop
        # below does not execute at all (e.g. T == 1).
        next_decision_time = start_time

        for t in range(start_time, start_time + H - dH, dH):
            next_decision_time = t + dH
            frac_amount = 1.0 * amount_remain / amount  # amount remain
            frac_time = 1.0 * (start_time + H - t) / H  # time remain
            state = (lob.display_book(depth),
                     np.array([frac_amount, frac_time], dtype=float))
            state = self.process_state(state)
            states.append(state)

            mid_price = lob.get_mid_price()
            state_input = self.pad_state(states[-self._config.state_history:],
                                         self._config.state_history)
            price, action = action_fn(start_time + H - t, amount_remain,
                                      state_input, mid_price)
            actions.append(action)
            done_mask.append(False)

            lob.update_own_order(price, amount_remain)

            for idx, message in self._mq.pop_to_next_time(t + dH):
                lob.process(**message)
                if lob.own_amount_to_trade == 0:
                    # Fully filled mid-interval: record the terminal state
                    # (zero amount left, time left taken from the message
                    # queue clock) and the reward earned since the last step.
                    done_mask.append(True)
                    state = (lob.display_book(depth),
                             np.array([
                                 0, 1.0 * (start_time + H - self._mq._time) / H
                             ],
                                      dtype=float))
                    state = self.process_state(state)
                    states.append(state)
                    rewards.append(lob.own_reward - cum_reward)
                    break
            if done_mask[-1]:
                break
            else:
                # Not fully filled during this interval: the step reward is
                # the increase in the book's cumulative own reward; carry the
                # cumulative reward and the remaining amount to the next step.
                rewards.append(lob.own_reward - cum_reward)
                cum_reward = lob.own_reward
                amount_remain = lob.own_amount_to_trade

        # An empty done_mask means no step ran, so the order is still open.
        if not done_mask or not done_mask[-1]:
            frac_amount = 1.0 * amount_remain / amount
            frac_time = 1.0 * (start_time + H - next_decision_time) / H
            state = (lob.display_book(depth),
                     np.array([frac_amount, frac_time], dtype=float))
            state = self.process_state(state)
            states.append(state)
            done_mask.append(False)

            # Forced completion at the dummy price; called with a single
            # argument here -- presumably the amount is optional, TODO confirm.
            lob.update_own_order(lob.own_trade_type *
                                 Limit_Order_book._DUMMY_VARIABLE)
            if lob.own_amount_to_trade == 0:
                rewards.append(lob.own_reward - cum_reward)
            else:
                # Still not filled: record a fixed penalty instead.
                rewards.append(-Limit_Order_book._DUMMY_VARIABLE)
            frac_amount = 1.0 * lob.own_amount_to_trade / amount
            state = (lob.display_book(depth),
                     np.array([frac_amount, 0], dtype=float))
            state = self.process_state(state)
            states.append(state)
            # Action index L denotes the market order.
            actions.append(self._config.L)
            done_mask.append(True)
        # done_mask holds one flag per state; dropping the first aligns it
        # with the per-transition rewards and actions.
        return (states, rewards, actions, done_mask[1:])

    def sampling_buffer(self):
        """Simulate one episode per training window and store it in the buffer.

        Windows start every ``config.H`` time units from ``train_start`` up
        to (not including) ``train_end``; actions come from
        ``self._action_fn``.
        """
        for start_time in range(self._config.train_start,
                                self._config.train_end, self._config.H):
            states, rewards, actions, done_mask = self.simulate_an_episode(
                self._config.I, self._config.T, self._config.H, start_time,
                self._config.direction, self._action_fn, self._config.depth)
            self._bf.store(states, actions, rewards, done_mask)

    def process_state(self, state):
        """Rescale the book part of a ``(state_book, state_it)`` pair.

        The book is cast to ``float32`` (a copy, so the input array is not
        modified); columns 0 and 2 are divided by 1e6 and columns 1 and 3 by
        1e2.  ``state_it`` is returned unchanged.
        """
        state_book, state_it = state
        state_book = state_book.astype('float32')
        # Columns presumably alternate price / size -- TODO confirm.
        state_book[:, 0] /= 1.e6
        state_book[:, 1] /= 1.e2
        state_book[:, 2] /= 1.e6
        state_book[:, 3] /= 1.e2
        return (state_book, state_it)
	"""
	mq.reset()
	mq.jump_to_time(time)

	lob_copy = copy.deepcopy(lob)
	lob_copy.update_own_order(a_price, amount)

	for idx, message in mq.pop_to_next_time(next_time):
		lob_copy.process(**message)
		if lob_copy.own_amount_to_trade == 0:
			break

	return [lob_copy.own_amount_to_trade, lob_copy.own_reward]

# Script body: in 'train' mode compute the result of Calculate_Q and save it
# to path_target; in 'test' mode load it back and evaluate a policy.
path_target = '../data/%s_Q_dp_%s.npy' % (args.tic,args.V)
oq = Order_Queue(file_order)
mq = Message_Queue(file_msg)

if args.mode == 'train':
	# Persist the computed array so a later 'test' run can load it.
	np.save(path_target, Calculate_Q(args.V, args.H, args.T, args.I, args.L,oq,mq))
elif args.mode == 'test':
	Q = np.load(path_target)
	Optimal_Q = Optimal_strategy(Q)
	# NOTE(review): Optimal_Q is not used below, while `Optimal_action` is
	# passed to evaluate_policy and is not defined in the visible code --
	# confirm it exists elsewhere and reads the intended Q data.
	rewards = evaluate_policy(args.test_start, args.test_end, args.order_direction, args.V, args.H, args.T, oq, mq, Optimal_action)
	print(rewards)
	print(np.mean(rewards))