Example #1
    def get_broker_episode_step(self, **kwargs):
        """

        Returns:
            exp. scaled episode duration in steps, normalized wrt. max possible episode steps
        """
        return exp_scale(self.iteration /
                         (self.data.numrecords - self.inner_embedding),
                         gamma=3)
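The exp_scale helper used above is imported from btgym's utilities and is not part of this listing. Below is a minimal sketch, assuming only the behaviour the docstring describes: it maps a normalized value in [0, 1] onto (0, 1] with exponentially growing weight toward the end of the episode; the actual btgym implementation may differ.

    import numpy as np

    def exp_scale(x, gamma=1.0):
        # Illustrative only: exp(gamma * (x - 1)) is ~exp(-gamma) at x = 0 and exactly 1 at x = 1,
        # so later episode steps receive exponentially larger weight.
        return np.exp(gamma * (x - 1.0))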
Example #2
    def get_reward(self):
        """
        Shapes the reward function as normalized single-trade realized profit/loss,
        augmented with potential-based reward shaping functions of the form:
        F(s, a, s') = gamma * FI(s') - FI(s);

        - potential FI_1 is the current normalized unrealized profit/loss;
        - potential FI_2 is the current normalized broker value;
        - potential FI_3 penalizes exposure held toward the end of the episode.

        Paper:
            "Policy invariance under reward transformations:
             Theory and application to reward shaping" by A. Ng et al., 1999;
             http://www.robotics.stanford.edu/~ang/papers/shaping-icml99.pdf
        """

        # All sliding statistics for this step are already updated by get_state().
        debug = {}

        # Potential-based shaping function 1:
        # based on potential of averaged profit/loss for the currently open trade (unrealized p/l):
        unrealised_pnl = np.asarray(self.sliding_stat['unrealized_pnl'])
        f1 = self.p.gamma * np.average(unrealised_pnl[1:]) - np.average(
            unrealised_pnl[:-1])
        #f1 = self.p.gamma * discounted_average(unrealised_pnl[1:], self.p.gamma)\
        #     - discounted_average(unrealised_pnl[:-1], self.p.gamma)

        debug['f1'] = f1

        # Potential-based shaping function 2:
        # based on potential of averaged broker value, normalized wrt. max drawdown and target bounds.
        norm_broker_value = np.asarray(self.sliding_stat['broker_value'])
        f2 = self.p.gamma * np.average(norm_broker_value[1:]) - np.average(
            norm_broker_value[:-1])
        #f2 = self.p.gamma * discounted_average(norm_broker_value[1:], self.p.gamma)\
        #     - discounted_average(norm_broker_value[:-1], self.p.gamma)

        debug['f2'] = f2

        # Potential-based shaping function 3:
        # negative potential of abs. size of position, exponentially weighted wrt. episode steps
        abs_exposure = np.abs(np.asarray(self.sliding_stat['exposure']))
        time = np.asarray(self.sliding_stat['episode_step'])
        #time_w = exp_scale(np.average(time[:-1]), gamma=5)
        #time_w_prime = exp_scale(np.average(time[1:]), gamma=5)
        #f3 = - 1.0 * time_w_prime * np.average(abs_exposure[1:]) #+ time_w * np.average(abs_exposure[:-1])
        f3 = - self.p.gamma * exp_scale(time[-1], gamma=3) * abs_exposure[-1] + \
             exp_scale(time[-2], gamma=3) * abs_exposure[-2]
        debug['f3'] = f3

        # Main reward function: normalized realized profit/loss:
        realized_pnl = np.asarray(self.sliding_stat['realized_pnl'])[-1]
        debug['f_real_pnl'] = 10 * realized_pnl

        # Weights are subject to tuning:
        self.reward = 1.0 * f1 + 1.0 * f2 + 0.0 * f3 + 10.0 * realized_pnl

        debug['r'] = self.reward
        debug['b_v'] = self.sliding_stat['broker_value'][-1]
        debug['unreal_pnl'] = self.sliding_stat['unrealized_pnl'][-1]
        debug['iteration'] = self.iteration

        #for k, v in debug.items():
        #    print('{}: {}'.format(k, v))
        #print('\n')

        # TODO: ------ignore-----:
        # 'Do-not-expose-for-too-long' shaping term:
        # - 1.0 * self.exp_scale(avg_norm_position_duration, gamma=3)

        self.reward = np.clip(self.reward, -1, 1)

        return self.reward
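To make the shaping terms above concrete, here is a small self-contained illustration of the potential-based term F(s, a, s') = gamma * FI(s') - FI(s) computed over a sliding window, the same way f1 and f2 are formed; the window values below are synthetic, whereas in the strategy they come from self.sliding_stat.

    import numpy as np

    def shaping_term(potentials, gamma):
        # F(s, a, s') = gamma * FI(s') - FI(s), averaged over a sliding window:
        # potentials[1:] play the role of FI(s'), potentials[:-1] the role of FI(s).
        potentials = np.asarray(potentials)
        return gamma * np.average(potentials[1:]) - np.average(potentials[:-1])

    window = [0.00, 0.02, 0.05, 0.04, 0.07]   # toy normalized unrealized p/l values
    print(shaping_term(window, gamma=0.99))   # positive -> potential improved over the window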
Example #3
    def update_sliding_stat(self):
        """
        Updates all sliding statistics deques with latest-step values:
            - normalized broker value
            - normalized broker cash
            - normalized exposure (position size)
            - position duration in steps, normalized wrt. max possible episode steps
            - exp. scaled episode duration in steps, normalized wrt. max possible episode steps
            - normalized, decayed realized profit/loss for the last closed trade
                (or zero if no trade has been closed within the last step);
            - normalized profit/loss for the currently open trade (unrealized p/l);
            - normalized best unrealized result achieved so far for the currently open trade;
            - normalized worst unrealized result achieved so far for the currently open trade;
            - one-hot encoding of actions received;
            - rewards received (based on self.reward variable values);
        """
        stat = self.sliding_stat
        current_value = self.env.broker.get_value()

        stat['broker_value'].append(
            norm_value(
                current_value,
                self.env.broker.startingcash,
                self.p.drawdown_call,
                self.p.target_call,
            ))
        stat['broker_cash'].append(
            norm_value(
                self.env.broker.get_cash(),
                self.env.broker.startingcash,
                99.0,
                self.p.target_call,
            ))
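        # Position size normalized by maximum affordable exposure
        # (starting cash * leverage); the small epsilon avoids division by zero: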
        stat['exposure'].append(
            self.position.size /
            (self.env.broker.startingcash * self.env.broker.get_leverage() +
             1e-2))
        stat['leverage'].append(
            self.env.broker.get_leverage())  # TODO: Do we need this?

        if self.trade_just_closed:
            stat['realized_pnl'].append(
                decayed_result(self.trade_result,
                               current_value,
                               self.env.broker.startingcash,
                               self.p.drawdown_call,
                               self.p.target_call,
                               gamma=1))
            # Reset flag:
            self.trade_just_closed = False
            # print('POS_OBS: step {}, just closed.'.format(self.iteration))

        else:
            stat['realized_pnl'].append(0.0)

        if self.position.size == 0:
            self.current_pos_duration = 0
            self.current_pos_min_value = current_value
            self.current_pos_max_value = current_value
            # print('ZERO_POSITION\n')

        else:
            self.current_pos_duration += 1
            if self.current_pos_max_value < current_value:
                self.current_pos_max_value = current_value

            elif self.current_pos_min_value > current_value:
                self.current_pos_min_value = current_value

        stat['pos_duration'].append(
            self.current_pos_duration /
            (self.data.numrecords - self.inner_embedding))
        stat['episode_step'].append(
            exp_scale(self.iteration /
                      (self.data.numrecords - self.inner_embedding),
                      gamma=3))
        stat['max_unrealized_pnl'].append(
            (self.current_pos_max_value - self.realized_broker_value) *
            self.broker_value_normalizer)
        stat['min_unrealized_pnl'].append(
            (self.current_pos_min_value - self.realized_broker_value) *
            self.broker_value_normalizer)
        stat['unrealized_pnl'].append(
            (current_value - self.realized_broker_value) *
            self.broker_value_normalizer)
        stat['action'].append(self.action_norm(self.last_action))
        stat['reward'].append(self.reward)
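For reference, here is a minimal sketch of how the self.sliding_stat container could be set up so that update_sliding_stat() only has to append and older values fall off automatically; the key names are taken from the method above, while the bounded-deque layout and the window parameter are assumptions rather than btgym's actual initialization.

    from collections import deque

    def make_sliding_stat(window):
        # One deque per tracked statistic; maxlen keeps only the last `window` steps.
        keys = (
            'broker_value', 'broker_cash', 'exposure', 'leverage',
            'realized_pnl', 'unrealized_pnl', 'max_unrealized_pnl', 'min_unrealized_pnl',
            'pos_duration', 'episode_step', 'action', 'reward',
        )
        return {key: deque(maxlen=window) for key in keys}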