Ejemplo n.º 1
0
    def get_profit_probability(self, x_vector, y_vector, iv, s, r, t):
        '''
        Returns the probability of obtaining a profit with the strategy in
        the current scenario under study.

        The underlying price at expiration is modelled with
        scipy.stats.lognorm using shape ``iv`` and scale ``s * exp(r * t)``.
        Break-even points are the prices in ``x_vector`` at which the sign of
        ``y_vector`` changes between consecutive samples.

        inputs:
            x_vector -> vector of underlying prices
            y_vector -> vector with Black-Scholes results (same length as
                        x_vector; positive values are treated as profit)
            iv -> underlying implied volatility (used as lognormal shape)
            s -> current underlying price
            r -> rate used to grow ``s`` into the lognormal scale
            t -> time used together with ``r`` in the lognormal scale
        returns:
            - 0 when more than two break-evens are found (an error message
              is printed),
            - 0.9999 / 0.0001 when there is no break-even, depending on
              whether the middle sample of y_vector is positive,
            - the probability of finishing on the profitable side when there
              is a single break-even,
            - the probability of finishing between the two break-evens
              otherwise.
        '''
        # NOTE(review): ``iv`` is passed unscaled as the lognormal shape;
        # presumably the caller already accounts for the time horizon
        # (iv * sqrt(t)) -- confirm against the caller.
        p_profit = 0
        # Calculate break-even points
        zero_crossings = np.where(np.diff(np.sign(y_vector)))[0]
        breakevens = [x_vector[i] for i in zero_crossings]
        if len(breakevens) > 2:
            print('ERROR: more than 2 zeroes detected')
        elif not breakevens:
            # No sign change: the whole range is either profit or loss.
            # Floor division keeps the index an integer on Python 3.
            p_profit = (0.9999 if y_vector[len(y_vector) // 2] > 0
                        else 0.0001)
        else:
            # REVIEW CDF can't return zero!
            scale = s * np.exp(r * t)
            print('Profit prob. with s=' + str(s) + ', iv=' + str(iv) +
                  ', b/e=' + str(breakevens))
            if len(breakevens) == 1:
                # Single break-even: the sign of the last sample tells which
                # side of the break-even is the profitable one.
                if y_vector[-1] > 0:
                    p_profit = lognorm.sf(breakevens[0], iv, scale=scale)
                else:
                    p_profit = lognorm.cdf(breakevens[0], iv, scale=scale)
            else:
                # Get probability of being below the min breakeven at
                # expiration
                p_below = lognorm.cdf(breakevens[0], iv, scale=scale)
                # Get probability of being above the max breakeven at
                # expiration
                p_above = lognorm.sf(breakevens[1], iv, scale=scale)
                # Get the probability of profit for the calendar
                p_profit = 1 - p_above - p_below
                print('1 - ' + str(p_below) + ' - ' + str(p_above) + ' = ' +
                      str(p_profit))  # TODO debugging purposes

        return p_profit
Ejemplo n.º 2
0
def multivariate_normality(X, alpha=.05):
    """Henze-Zirkler multivariate normality test.

    Parameters
    ----------
    X : np.array
        Data matrix of shape (n_samples, n_features). Integer and floating
        point data are both supported.
    alpha : float
        Significance level.

    Returns
    -------
    hz : float
        The Henze-Zirkler test statistic.
    pval : float
        P-value.
    normal : boolean
        True if X comes from a multivariate normal distribution.

    See Also
    --------
    normality : Test the univariate normality of one or more variables.
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Henze-Zirkler test [1]_ has a good overall power against alternatives
    to normality and works for any dimension and sample size.

    Adapted to Python from a Matlab code [2]_ by Antonio Trujillo-Ortiz and
    tested against the
    `MVN <https://cran.r-project.org/web/packages/MVN/MVN.pdf>`_ R package.

    Rows with missing values are automatically removed.

    When the covariance matrix is not full-rank, the statistic is set to
    ``4 * n_samples`` instead of being computed from the data.

    References
    ----------
    .. [1] Henze, N., & Zirkler, B. (1990). A class of invariant consistent
       tests for multivariate normality. Communications in Statistics-Theory
       and Methods, 19(10), 3595-3617.

    .. [2] Trujillo-Ortiz, A., R. Hernandez-Walls, K. Barba-Rojo and L.
       Cupul-Magana. (2007). HZmvntest: Henze-Zirkler's Multivariate
       Normality Test. A MATLAB file.

    Examples
    --------
    >>> import pingouin as pg
    >>> data = pg.read_dataset('multivariate')
    >>> X = data[['Fever', 'Pressure', 'Aches']]
    >>> pg.multivariate_normality(X, alpha=.05)
    HZResults(hz=0.5400861018514641, pval=0.7173686509624891, normal=True)
    """
    from scipy.stats import lognorm

    # Check input and remove missing values
    X = np.asarray(X)
    assert X.ndim == 2, 'X must be of shape (n_samples, n_features).'
    X = X[~np.isnan(X).any(axis=1)]
    n, p = X.shape
    assert n >= 3, 'X must have at least 3 rows.'
    assert p >= 2, 'X must have at least two columns.'

    # Covariance matrix
    S = np.cov(X, rowvar=False, bias=True)
    S_inv = np.linalg.pinv(S)
    # Preserve the original floating dtype (e.g. float32). Casting to an
    # integer dtype would truncate the inverse covariance, so integer input
    # keeps the float result returned by pinv.
    if np.issubdtype(X.dtype, np.floating):
        S_inv = S_inv.astype(X.dtype)
    difT = X - X.mean(0)

    # Squared-Mahalanobis distances
    Dj = np.diag(np.linalg.multi_dot([difT, S_inv, difT.T]))
    Y = np.linalg.multi_dot([X, S_inv, X.T])
    Djk = -2 * Y.T + np.repeat(np.diag(Y.T), n).reshape(n, -1) + \
        np.tile(np.diag(Y.T), (n, 1))

    # Smoothing parameter
    b = 1 / (np.sqrt(2)) * ((2 * p + 1) / 4)**(1 / (p + 4)) * \
        (n**(1 / (p + 4)))

    # Is matrix full-rank (columns are linearly independent)?
    if np.linalg.matrix_rank(S) == p:
        hz = n * (1 / (n**2) * np.sum(np.exp(-(b**2) / 2 * Djk)) - 2 *
                  ((1 + (b**2))**(-p / 2)) * (1 / n) *
                  (np.sum(np.exp(-((b**2) / (2 * (1 + (b**2)))) * Dj))) +
                  ((1 + (2 * (b**2)))**(-p / 2)))
    else:
        hz = n * 4

    wb = (1 + b**2) * (1 + 3 * b**2)
    a = 1 + 2 * b**2
    # Mean and variance
    mu = 1 - a**(-p / 2) * (1 + p * b**2 / a + (p * (p + 2) * (b**4)) /
                            (2 * a**2))
    si2 = 2 * (1 + 4 * b**2)**(-p / 2) + 2 * a**(-p) * \
        (1 + (2 * p * b**4) / a**2 + (3 * p * (p + 2) * b**8) / (4 * a**4)) \
        - 4 * wb**(-p / 2) * (1 + (3 * p * b**4) / (2 * wb)
                              + (p * (p + 2) * b**8) / (2 * wb**2))

    # Lognormal mean and variance
    pmu = np.log(np.sqrt(mu**4 / (si2 + mu**2)))
    psi = np.sqrt(np.log((si2 + mu**2) / mu**2))

    # P-value: upper tail of the lognormal fitted to (mu, si2)
    pval = lognorm.sf(hz, psi, scale=np.exp(pmu))
    normal = bool(pval > alpha)

    HZResults = namedtuple('HZResults', ['hz', 'pval', 'normal'])
    return HZResults(hz=hz, pval=pval, normal=normal)
Ejemplo n.º 3
0
         label='2: $\mu$ and $\sigma$ estimated by MLE')
plt.legend(loc='upper right')

# In[74]:

# 1D
# Likelihood-ratio test: twice the log-likelihood gap between the MLE fit and
# the hypothesised (mu, sigma), compared against a chi-square with 2 degrees
# of freedom.
log_lik_h0 = log_lik_lognorm(data, mu, sigma)
log_lik_mle = log_lik_lognorm(data, mu_MLE, sig_MLE)
LR_val = 2 * (log_lik_mle - log_lik_h0)
pval_h0 = 1.0 - sts.chi2.cdf(LR_val, 2)
print(pval_h0)

# In[75]:

# 1E
# scipy's lognorm represents LN(mu, sigma) with shape s=sigma and
# scale=exp(mu); loc must stay at its default of 0, otherwise the support of
# the distribution is shifted by mu.
print(lognorm.sf(100000, sig_MLE, scale=np.exp(mu_MLE)))
print(lognorm.cdf(75000, sig_MLE, scale=np.exp(mu_MLE)))

# In[85]:

# 2A
import pandas as pd

data = pd.read_csv('sick.txt', header=0)


def norm_pdf(xvals, mu, sigma):
    """Evaluate the normal probability density at ``xvals``.

    Parameters
    ----------
    xvals : scalar or array
        Point(s) at which the density is evaluated.
    mu : scalar
        Mean of the normal distribution.
    sigma : scalar
        Standard deviation of the normal distribution.

    Returns
    -------
    Density value(s), with the same shape as ``xvals``.
    """
    normaliser = 1 / (sigma * np.sqrt(2 * np.pi))
    exponent = -(xvals - mu)**2 / (2 * sigma**2)
    return normaliser * np.exp(exponent)
Ejemplo n.º 4
0
def multivariate_normality(X, alpha=.05):
    """Henze-Zirkler multivariate normality test.

    Parameters
    ----------
    X : np.array
        Data matrix of shape (n, p) where n are the observations and p the
        variables.
    alpha : float
        Significance level.

    Returns
    -------
    normal : boolean
        True if X comes from a multivariate normal distribution.
    p : float
        P-value.

    See Also
    --------
    normality : Test the univariate normality of one or more variables.
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Henze-Zirkler test has a good overall power against alternatives
    to normality and is feasible for any dimension and any sample size.

    Adapted to Python from a Matlab code by Antonio Trujillo-Ortiz.

    Tested against the R package MVN.

    The covariance matrix is inverted with the Moore-Penrose pseudo-inverse,
    so a singular covariance matrix (linearly dependent columns) does not
    raise; in that case the test statistic is set to ``4 * n``.

    References
    ----------
    .. [1] Henze, N., & Zirkler, B. (1990). A class of invariant consistent
       tests for multivariate normality. Communications in Statistics-Theory
       and Methods, 19(10), 3595-3617.

    .. [2] Trujillo-Ortiz, A., R. Hernandez-Walls, K. Barba-Rojo and L.
       Cupul-Magana. (2007). HZmvntest: Henze-Zirkler's Multivariate
       Normality Test. A MATLAB file.

    Examples
    --------
    1. Test for multivariate normality of 2 variables

        >>> import numpy as np
        >>> from pingouin import multivariate_normality
        >>> np.random.seed(123)
        >>> mean, cov, n = [4, 6], [[1, .5], [.5, 1]], 30
        >>> X = np.random.multivariate_normal(mean, cov, n)
        >>> normal, p = multivariate_normality(X, alpha=.05)
        >>> print(normal, p)
            True 0.7523511059223078

    2. Test for multivariate normality of 3 variables

        >>> import numpy as np
        >>> from pingouin import multivariate_normality
        >>> np.random.seed(123)
        >>> mean, cov = [4, 6, 5], [[1, .5, .2], [.5, 1, .1], [.2, .1, 1]]
        >>> X = np.random.multivariate_normal(mean, cov, 50)
        >>> normal, p = multivariate_normality(X, alpha=.05)
        >>> print(normal, p)
            True 0.46074660317578175
    """
    from scipy.stats import lognorm

    # Check input
    X = np.asarray(X)
    assert X.ndim == 2, 'X must be of shape (n, p).'
    n, p = X.shape
    assert p >= 2, 'X must have at least two columns.'

    # Covariance matrix
    S = np.cov(X, rowvar=False, bias=True)
    # pinv instead of inv: inv raises LinAlgError on a singular covariance,
    # which would make the rank-deficient branch below unreachable.
    S_inv = np.linalg.pinv(S)
    difT = X - X.mean(0)
    # Squared-Mahalanobis distances
    Dj = np.diag(np.linalg.multi_dot([difT, S_inv, difT.T]))
    Y = np.linalg.multi_dot([X, S_inv, X.T])
    Djk = -2 * Y.T + np.repeat(np.diag(Y.T), n).reshape(n, -1) + \
        np.tile(np.diag(Y.T), (n, 1))

    # Smoothing parameter
    b = 1 / (np.sqrt(2)) * ((2 * p + 1) / 4)**(1 / (p + 4)) * \
        (n**(1 / (p + 4)))

    # Is the covariance matrix full-rank (columns linearly independent)?
    if np.linalg.matrix_rank(S) == p:
        hz = n * (1 / (n**2) * np.sum(np.sum(np.exp(-(b**2) / 2 * Djk))) - 2 *
                  ((1 + (b**2))**(-p / 2)) * (1 / n) *
                  (np.sum(np.exp(-((b**2) / (2 * (1 + (b**2)))) * Dj))) +
                  ((1 + (2 * (b**2)))**(-p / 2)))
    else:
        hz = n * 4

    wb = (1 + b**2) * (1 + 3 * b**2)
    a = 1 + 2 * b**2
    # Mean and variance
    mu = 1 - a**(-p / 2) * (1 + p * b**2 / a + (p * (p + 2) * (b**4)) /
                            (2 * a**2))
    si2 = 2 * (1 + 4 * b**2)**(-p / 2) + 2 * a**(-p) * \
        (1 + (2 * p * b**4) / a**2 + (3 * p * (p + 2) * b**8) / (4 * a**4)) \
        - 4 * wb**(-p / 2) * (1 + (3 * p * b**4) / (2 * wb)
                              + (p * (p + 2) * b**8) / (2 * wb**2))

    # Lognormal mean and variance
    pmu = np.log(np.sqrt(mu**4 / (si2 + mu**2)))
    psi = np.sqrt(np.log((si2 + mu**2) / mu**2))

    # P-value: upper tail of the lognormal fitted to (mu, si2)
    pval = lognorm.sf(hz, psi, scale=np.exp(pmu))
    normal = True if pval > alpha else False
    return normal, pval
Ejemplo n.º 5
0
 def hazard_pdf(self, t):
     """Return the lognormal hazard rate pdf(t) / sf(t).

     The distribution is scipy's ``lognorm`` with shape ``self.sigma``,
     loc 0 and scale ``exp(self.mu)``.

     The ratio is evaluated in log space (``exp(logpdf - logsf)``): far in
     the right tail both pdf and sf underflow to 0 and the plain quotient
     becomes nan, while the difference of logs stays finite.
     """
     shape = self.sigma
     scale = np.exp(self.mu)
     log_hazard = (lognorm.logpdf(t, shape, 0, scale)
                   - lognorm.logsf(t, shape, 0, scale))
     return np.exp(log_hazard)
Ejemplo n.º 6
0
    def estimate(self, sampler):
        """Update the traffic statistics and overload-risk estimates.

        Requests the aggregated counters by putting 'rate_data' on
        self.request_queue and reading the reply from self.sample_queue.
        From the change in those aggregates since the previous call it
        derives, separately for TX and RX:

        * the per-sample mean (self.mean_tx / self.mean_rx),
        * the variance as mean-of-squares minus squared mean
          (self.var_tx / self.var_rx),
        * lognormal parameters matched to that mean and variance
          (self.mu_*, self.sigma2_*),
        * the overload risk in percent, i.e. the lognormal survival function
          evaluated at self.linerate * self.cutoff, times 100.

        When self.display_data is set, a status screen is printed using ANSI
        escape sequences. Finally any entries left in self.sample_queue are
        drained.

        Parameters:
            sampler: object providing get_interface() and get_sample_rate();
                used for the wireless line-rate lookup and for the display.

        Side effects:
            Updates self.time_of_last_calc, the self.last_* counters and the
            statistics listed above. On a ValueError during estimation or
            display, diagnostics are printed and exit(1) is called.

        NOTE(review): n (new samples since the last call) is used as a
        divisor without a check; if no new samples arrived this raises
        ZeroDivisionError, which is not caught here -- confirm whether the
        caller guarantees n > 0.
        NOTE(review): round(..., 5) only removes negative variances smaller
        in magnitude than the rounding step; a larger negative variance stays
        negative and presumably ends in the ValueError handlers below.
        """

        # A wireless interface can increase or decrease its line rate
        # so the line rate is checked regularly for WiFi.
        if (self.interface_type == InterfaceType.Wireless):
            dummy,self.linerate = self.get_linerate_wireless(sampler.get_interface())

        # Wall-clock time elapsed since the previous estimate (display only).
        t = time.time()
        est_timer = t - self.time_of_last_calc
        self.time_of_last_calc =  t

#        self.request_queue.put('r')
        # Ask for the current aggregates and wait for the reply.
        self.request_queue.put('rate_data')
        rate_data = self.sample_queue.get()
        self.sample_queue.task_done()
        tx_agg = rate_data['tx_agg']
        rx_agg = rate_data['rx_agg']
        samples = rate_data['samples']
        txb2 = rate_data['txb2']
        rxb2 = rate_data['rxb2']

        # Number of samples recorded since the previous call.
        n = samples - self.last_samples

        # Approximately kbytes/sec, but not really since we have a
        # measurement jitter of the number of samples recorded in each
        # sampling period. (Usually, by default ms). (The sampling
        # often cannot keep up).
        self.mean_tx = (tx_agg - self.last_tx_agg) / n
        self.mean_rx = (rx_agg - self.last_rx_agg) / n

        # Squared means, used below as E[X]^2 in Var = E[X^2] - E[X]^2.
        mean_square_tx = self.mean_tx*self.mean_tx
        mean_square_rx = self.mean_rx*self.mean_rx

        # Despite the name these are per-sample means of the squared values
        # (delta of txb2 / rxb2 divided by n).
        sum_square_tx = (txb2 - self.last_txb2) / n
        sum_square_rx = (rxb2 - self.last_rxb2) / n

        # NOTE: Rounding to 5 decimals is perhaps correct if we get
        # negative variance due to the measurement jitter.
        # It is not clear why we get a measurement jitter, so why this
        # is necessary is a somewhat of a mystery.
        self.var_tx = sum_square_tx - mean_square_tx
        if self.var_tx < 0:
            if self.display_data:
                print("\33[9;1H")  # ANSI: move cursor to row 9, column 1
                print("\33[0J")  # ANSI: erase from cursor to end of screen
            print("WARNING: self.var_tx == " + str(self.var_tx))
            self.var_tx = round(sum_square_tx - mean_square_tx,5) # round to avoid negative value
        self.var_rx = sum_square_rx - mean_square_rx
        if self.var_rx < 0:
            if self.display_data:
                print("\33[10;1H")  # ANSI: move cursor to row 10, column 1
                print("\33[0J")  # ANSI: erase from cursor to end of screen
            print("WARNING: self.var_rx == " + str(self.var_rx))
            self.var_rx = round(sum_square_rx - mean_square_rx,5) # round to avoid negative value

        # Debug dump; permanently disabled by the "and False".
        if self.debug and False:
            print("\33[12;1H")
            print("\33[0J################### DEBUG ##################")
            print("\33[0Jest_timer:      %f"%est_timer)
            print("\33[0Jself.mean_tx:   %f        self.mean_rx:  %f"%(self.mean_tx,self.mean_rx))
            print("\33[0Jtxb2:           %f        rxb2           %f"%(txb2,rxb2))
            print("\33[0Jself.last_txb2  %f        self.last_rxb2 %f"%(self.last_txb2,self.last_rxb2))
            print("\33[0Jmean_square_tx  %f        mean_square_rx %f"%((mean_square_tx),(mean_square_rx)))
            print("\33[0Jsum_square_tx   %f        sum_square_rx  %f"%(sum_square_tx,sum_square_rx))
            print("\33[0Jself.var tx:         %f        self.var_rx:        %f"%(self.var_tx,self.var_rx))


        # Remember the current aggregates so the next call can take deltas.
        self.last_samples = samples

        self.last_tx_agg = tx_agg
        self.last_rx_agg = rx_agg

        self.last_txb2 = txb2
        self.last_rxb2 = rxb2

        # Estimate the moments
        # Lognormal moment matching: sigma^2 = ln(1 + var / mean^2) and
        # mu = ln(mean) - sigma^2 / 2. A zero mean falls back to 0 for both.
        try:
            if self.mean_tx != 0.0:
                self.sigma2_tx = math.log(1.0+(self.var_tx/mean_square_tx))
                self.mu_tx = math.log(self.mean_tx) - (self.sigma2_tx/2.0)
            else:
#                self.sigma2_tx = float('nan')
                self.sigma2_tx = 0.0
                self.mu_tx = 0.0

            if self.mean_rx != 0.0:
                self.sigma2_rx = math.log(1.0+(self.var_rx/(mean_square_rx)))
                self.mu_rx = math.log(self.mean_rx) - (self.sigma2_rx/2.0)
            else:
#                self.sigma2_rx = float('nan')
                self.sigma2_rx = 0.0
                self.mu_rx = 0.0

        # Calculate the overload risk

## Based on the original code, using the CDF (Cumulative Distribution Function).
#            self.overload_risk_tx = (1-lognorm.cdf(self.linerate * self.cutoff,math.sqrt(self.sigma2_tx),0,math.exp(self.mu_tx)))*100
#            self.overload_risk_rx = (1-lognorm.cdf(self.linerate * self.cutoff,math.sqrt(self.sigma2_rx),0,math.exp(self.mu_rx)))*100

## Using the survival function (1 - cdf). See http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html for a motivation).
            # Risk in percent that the rate exceeds linerate * cutoff, with
            # shape sqrt(sigma2), loc 0 and scale exp(mu).
            self.overload_risk_tx = (lognorm.sf(self.linerate * self.cutoff,math.sqrt(self.sigma2_tx),0,math.exp(self.mu_tx)))*100
            self.overload_risk_rx = (lognorm.sf(self.linerate * self.cutoff,math.sqrt(self.sigma2_rx),0,math.exp(self.mu_rx)))*100

### According to our discussion, using the PPF (Percentile Point Function (or Quantile function).
#            self.cutoff_rate_tx = (1-lognorm.ppf( self.cutoff,math.sqrt(self.sigma2_tx),0,math.exp(self.mu_tx)))
#            self.cutoff_rate_rx = (1-lognorm.ppf( self.cutoff,math.sqrt(self.sigma2_rx),0,math.exp(self.mu_rx)))
            # To estimate a risk: compare the calculated cutoff rate with the nominal line rate.

        except ValueError as ve:
            # math.log / math.sqrt reject out-of-domain arguments (e.g. a
            # negative variance); dump the inputs and terminate.
            # "\33[2K" is the ANSI sequence that erases the current line.
            if self.display_data:
                print("\33[2K")
            print("Error in estimation: ({}):".format(ve))
            traceback.print_exc()
            if self.display_data:
                print("\33[2K")
            print("mean_tx: %.2e, mean_rx: %.2e "%(self.mean_tx,self.mean_rx))
            if self.display_data:
                print("\33[2K")
            print("var_tx: %.2e, var_rx: %.2e "%(self.var_tx,self.var_rx))
            if self.display_data:
                print("\33[2K")
            print("mean_square_tx: %.2e, mean_square_rx: %.2e "%(mean_square_tx,mean_square_rx))
            if self.display_data:
                print("\33[2K")
            print("rate_data: %s"%(rate_data,))
            exit(1)

        # Optional status screen, redrawn in place from the top-left corner.
        if self.display_data:
            try:
                print("\33[H",end="") # move cursor home
    # [PD] 2016-05-23, The calculation of "actual" seems to be buggy.
    #            print("\33[2KEstimate (sample_rate: {:d} actual({:d}), interface: {}, linerate: {:d}".format(sampler.get_sample_rate(), n, sampler.get_interface(),self.linerate))
                print("\33[2Ksample_rate (/s): {:d}, interface: {}, linerate (bytes/s): {:d}, link speed (Mbit/s): {:d}".format(sampler.get_sample_rate(), sampler.get_interface(),self.linerate,self.link_speed))
                # math.sqrt raises ValueError here if the variance is still
                # negative after the rounding above.
                print("\33[2KTX(mean: %.2e b/s std: %.2e mu: %.2e s2: %.2e, ol-risk: %.2e) "%(self.mean_tx,math.sqrt(self.var_tx),self.mu_tx,self.sigma2_tx, self.overload_risk_tx))
                print("\33[2KRX(mean: %.2e b/s std: %.2e mu: %.2e s2: %.2e, ol-risk: %.2e) "%(self.mean_rx,math.sqrt(self.var_rx),self.mu_rx,self.sigma2_rx, self.overload_risk_rx))
                print("\33[2Kestimation timer: {:.4f}".format(est_timer))
                print("\33[2Kestimation interval: {:.2f}".format(self.est_interval))
                print("\33[2Kmeter interval: %d"%(self.meter_interval))
                print("\33[2Kmode: %d"%(self.mode))
                if self.debug:
                    print("\33[2Kdebug: %s"%str(self.debug))
                    print("\33[2Ksample_queue size: %s"%str(self.sample_queue.qsize()))
            except ValueError as ve:
                print("\33[2KError in display ({}):".format(ve))
                traceback.print_exc()
                print("\33[2Kvar_tx: %.2e, var_rx: %.2e "%(self.var_tx,self.var_rx))
                print("\33[2Krate_data: %s"%(rate_data,))
                exit(1)

        # FIXME: It should not be necessary to empty the queue here
        # anymore, since the monitor code only puts stuff in the Queue
        # on request.
        # Verify this before remove this while loop!
        while not self.sample_queue.empty():
            self.sample_queue.get()
            self.sample_queue.task_done()
Ejemplo n.º 7
0
        current_price = None
        if not args.current_price:
            # TODO Get current underlying price from Yahoo or similar
            pass
        else:
            current_price = args.current_price
            current_iv = args.iv
            if len(breakevens) > 2:
                print 'ERROR: more than 2 zeroes detected'
            else:
                # Get probability of underlying being below first zero
                # REVIEW CDF can't return zero!
                scale = current_price * np.exp(r * t)
                p_below = lognorm.cdf(breakevens[0], current_iv, scale=scale)
                # Get probability of underlying being above second zero
                p_above = lognorm.sf(breakevens[1], current_iv, scale=scale)
                # Get the probability of profit for the calendar
                p_profit = 1 - p_above - p_below
                print('Probabilities (below, above, profit): ' + str(p_below) +
                      ' - ' + str(p_above) + ' - ' + str(p_profit))

        calls = df[df['m_right'] == 'C']
        plot_calendars(calls, near_term, next_term, current_price)
        sys.exit()  # TODO remove to go on
        '''
        # Do all the possible combinations of expiries for calendar spreads
        expiry_combinations = list(combinations(expiries, r=2))

        # Iterate calls
        for (near_term, next_term) in expiry_combinations:
            if near_term > next_term:
Ejemplo n.º 8
0
    plt.savefig('fig_1c')
    plt.close()

# 1d
# Likelihood-ratio test: twice the log-likelihood gap between the MLE fit and
# the parameters from 1b, compared against a chi-square with 2 degrees of
# freedom.

log_lik_h0 = log_lik_lognorm(incomes_array, mu_1b, sigma_1b)
log_lik_mle = log_lik_lognorm(incomes_array, mu_mle, sigma_mle)
LR_val = 2 * (log_lik_mle - log_lik_h0)
pval_h0 = 1.0 - sts.chi2.cdf(LR_val, 2)
print('1d: p-value from the chi-square test: {:.3f}\n'.format(pval_h0))

# 1e
# scipy's lognorm represents LN(mu, sigma) with shape s=sigma and
# scale=exp(mu); loc must stay at its default of 0, otherwise the support of
# the distribution is shifted by mu.

print('1e: Probability of earning more than $100,000: {:.4f}'.format(
    lognorm.sf(100000, sigma_mle, scale=np.exp(mu_mle))))

print('    Probability of earning less than $75,000: {:.4f}\n'.format(
    lognorm.cdf(75000, sigma_mle, scale=np.exp(mu_mle))))

# Q2

sick_df = pd.read_csv('sick.txt')
# The first header carries a UTF-8 byte-order mark; rename it to plain 'sick'.
sick_df.rename(columns={'\ufeffsick': 'sick'}, inplace=True)


def log_like_sick(sick_df, b0, b1, b2, b3, sigma):
    error = sick_df.sick - b0 - b1*sick_df.age - b2*sick_df.children - b3*\
            sick_df.avgtemp_winter
    pdf_vals = norm_pdf(error, 0, sigma)
    log_lik_val = np.log(pdf_vals).sum()