def norminvcdf(qin):
    # Inverse normal CDF, clamped away from q == 0 and q == 1 so the result stays finite.
    eps = 1.e-320
    batch_ppf_fn = lambda q: norm.ppf(1. - 1.e-16) if q == 1. else norm.ppf(q + eps)
    if len(qin) == 1:
        return batch_ppf_fn(qin)
    else:
        return np.array([batch_ppf_fn(q) for q in qin])
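A small usage sketch (it assumes numpy and scipy.stats.norm are imported as np and norm, as the function itself does):

vals = norminvcdf(np.array([0.025, 0.5, 0.975]))
print(vals)   # roughly [-1.96, 0.0, 1.96]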
Example #2
def dprime_yes_no(H, FA):
    """
    Compute *d'* for one-interval 'yes/no' type tasks from hit and false alarm rates.

    Parameters
    ----------
    H : float
        Hit rate.
    FA : float
        False alarm rate.

    Returns
    -------
    dprime : float
        *d'* value

    Examples
    --------
    >>> dp = dprime_yes_no(0.7, 0.2)

    References
    ----------
    .. [1] Green, D. M., & Swets, J. A. (1988). *Signal Detection Theory and Psychophysics*. Los Altos, California: Peninsula Publishing.
    .. [2] Macmillan, N. A., & Creelman, C. D. (2004). *Detection Theory: A User’s Guide (2nd ed.)*. London: Lawrence Erlbaum Associates.

    """
    
    if H < 0 or H > 1:
        raise ValueError("H must be between 0 and 1")
    if FA < 0 or FA > 1:
        raise ValueError("FA must be between 0 and 1")

    return norm.ppf(H) - norm.ppf(FA)
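A quick numerical check of the formula above (a minimal sketch; it only assumes scipy.stats.norm):

from scipy.stats import norm

# d' = z(H) - z(FA) for the docstring example
print(norm.ppf(0.7) - norm.ppf(0.2))   # ~1.366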
Example #3
    def compute_null_stats(self, elec_pair_phase_diff, recalled, elec_pair_stats):

        res = Parallel(n_jobs=12, verbose=5)(delayed(calc_circ_stats)(elec_pair_phase_diff, recalled, True)
                                             for _ in range(self.n_perms))

        # for the rayleigh z and the resultant vector length, compute the actual difference between good and bad
        # memory at each timepoint. Then compute a null distribution from shuffled data. Then compute the rank of the
        # real data compared to the shuffled at each timepoint. Convert rank to z-score and return
        null_elec_pair_zs_rec = np.stack([x['elec_pair_z_rec'] for x in res], 0)
        null_elec_pair_zs_nrec = np.stack([x['elec_pair_z_nrec'] for x in res], 0)
        null_delta_mem_zs = null_elec_pair_zs_rec - null_elec_pair_zs_nrec
        real_delta_mem_zs = elec_pair_stats['elec_pair_z_rec'] - elec_pair_stats['elec_pair_z_nrec']
        delta_mem_zs_rank = np.mean(real_delta_mem_zs > null_delta_mem_zs, axis=0)
        delta_mem_zs_rank[delta_mem_zs_rank == 0] += 1/self.n_perms
        delta_mem_zs_rank[delta_mem_zs_rank == 1] -= 1 / self.n_perms

        null_elec_pair_rvls_rec = np.stack([x['elec_pair_rvl_rec'] for x in res], 0)
        null_elec_pair_rvls_nrec = np.stack([x['elec_pair_rvl_nrec'] for x in res], 0)
        null_delta_mem_rvls = null_elec_pair_rvls_rec - null_elec_pair_rvls_nrec
        real_delta_mem_rvls = elec_pair_stats['elec_pair_rvl_rec'] - elec_pair_stats['elec_pair_rvl_nrec']
        delta_mem_rvls_rank = np.mean(real_delta_mem_rvls > null_delta_mem_rvls, axis=0)
        delta_mem_rvls_rank[delta_mem_rvls_rank == 0] += 1/self.n_perms
        delta_mem_rvls_rank[delta_mem_rvls_rank == 1] -= 1 / self.n_perms

        return norm.ppf(delta_mem_zs_rank), norm.ppf(delta_mem_rvls_rank)
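A self-contained sketch of the rank-to-z step used above, on synthetic numbers (not the author's data):

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
null = rng.normal(size=(1000, 5))             # n_perms x n_timepoints null statistics
real = np.array([2.0, 0.0, -1.5, 0.5, 3.5])   # "real" statistic at each timepoint
rank = np.mean(real > null, axis=0)
rank = np.clip(rank, 1 / 1000, 1 - 1 / 1000)  # same endpoint handling as above
print(norm.ppf(rank))                         # z-scored ranks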
Example #4
def draw_tile(metadata, config, target_path):
    decoder = config.build_decoder()
    decoder_layers = nn.layers.get_all_layers(decoder.l_out)
    print "  decoder layer output shapes:"
    nparams = len(nn.layers.get_all_params(decoder.l_out))
    nn.layers.set_all_param_values(decoder.l_out, metadata['param_values'][-nparams:])

    for layer in decoder_layers:
        name = layer.__class__.__name__
        print "    %s %s" % (string.ljust(name, 32), nn.layers.get_output_shape(layer))

    mesh = np.linspace(0.001, 0.999, 20)
    z = np.zeros((400, 2), dtype='float32')
    for i in xrange(20):
        for j in xrange(20):
            z[20 * i + j, :] = np.array([norm.ppf(mesh[i]), norm.ppf(mesh[j])])

    sample = theano.function([decoder.l_z.input_var], nn.layers.get_output(decoder_layers[-1]))

    digits = sample(z)

    tile = np.zeros((20 * 28, 20 * 28), dtype='float32')

    for i in xrange(20):
        for j in xrange(20):
            d = np.reshape(digits[20 * i + j, :], (28, 28))
            tile[i * 28:(i + 1) * 28, j * 28:(j + 1) * 28] = d

    plt.imsave(target_path + 'tile.png', tile, cmap=matplotlib.cm.Greys)
def bca(data, alphas, statarray, statfunction, ostat, reps):
    '''Subroutine called to calculate the BCa statistics. Borrowed heavily from scikits.bootstrap code.'''

    # The bias correction value.
    z0=norm.ppf( ( 1.0*np.sum(statarray < ostat, axis=0)  ) / reps )

    # Statistics of the jackknife distribution
    jackindexes=jackknife_indexes(data[0]) # I use the scikits.bootstrap function here.
    jstat=[statfunction(*(x[indexes] for x in data)) for indexes in jackindexes]
    jmean=np.mean(jstat,axis=0)

    # Acceleration value
    a=np.sum( (jmean - jstat)**3, axis=0 ) / ( 6.0 * np.sum( (jmean - jstat)**2, axis=0)**1.5 )
    if np.any(np.isnan(a)):
        nanind=np.nonzero(np.isnan(a))
        warnings.warn("Some acceleration values were undefined. \
            This is almost certainly because all values \
            for the statistic were equal. Affected \
            confidence intervals will have zero width and \
            may be inaccurate (indexes: {}). \
            Other warnings are likely related.".format(nanind))
    zs=z0 + norm.ppf(alphas).reshape(alphas.shape+(1,)*z0.ndim)
    avals=norm.cdf(z0 + zs/(1-a*zs))
    nvals=np.round((reps-1)*avals)
    nvals=np.nan_to_num(nvals).astype('int')
    
    return nvals
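A minimal, self-contained BCa sketch for a bootstrap of the mean, mirroring the logic above without the scikits.bootstrap helpers (a leave-one-out loop stands in for jackknife_indexes):

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
data = rng.normal(loc=1.0, scale=2.0, size=50)
reps = 2000
boot = np.sort([rng.choice(data, size=data.size).mean() for _ in range(reps)])
ostat = data.mean()

z0 = norm.ppf(np.sum(boot < ostat) / reps)                               # bias correction
jack = np.array([np.delete(data, i).mean() for i in range(data.size)])   # jackknife means
jmean = jack.mean()
a = np.sum((jmean - jack) ** 3) / (6.0 * np.sum((jmean - jack) ** 2) ** 1.5)  # acceleration

alphas = np.array([0.025, 0.975])
zs = z0 + norm.ppf(alphas)
avals = norm.cdf(z0 + zs / (1 - a * zs))
lo, hi = boot[np.round((reps - 1) * avals).astype(int)]
print(lo, hi)   # BCa ~95% confidence limits for the mean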
Example #6
def plot3(A,B,S,T,D, tMAP=None):
    States = D.Y
    pMAP = [9.75961269, 0.0583687877, 78.4901534, 78.1696975]
    alpha, beta, sig2, ome2 = pMAP
    tMAP = (alpha, beta, sig2, ome2, States, D.Y)

    x1 = np.linspace(norm.ppf(0.01), norm.ppf(0.99), 100)
    x2 = np.linspace(invgamma.ppf(0.01, S.prior.p1/2), invgamma.ppf(0.99, S.prior.p1/2), 100)
    x3 = np.linspace(invgamma.ppf(0.01, T.prior.p1/2), invgamma.ppf(0.99, T.prior.p1/2), 100)

    tmp = lambda (a, b, s, t, X, Y): 1/(1/A.prior.var + D.n/S.prior.p2)
    print tMAP
    print tmp(tMAP)
    print A.cond.var(tMAP)
    print A.cond.m(tMAP)
    print A.cond.rv((tMAP))

    plt.plot(x1, A.cond.rv(tMAP).pdf(x1), 'r-', lw=5, alpha=0.6, label='a prior')
    plt.title('alpha conditional at MAP')
    plt.show()

    plt.plot(x1, B.cond.rv(tMAP).pdf(x1), 'r-', lw=5, alpha=0.6, label='b prior')
    plt.title('beta conditional at MAP')
    plt.show()

    plt.plot(x2, S.cond.rv(tMAP).pdf(x2), 'r-', lw=5, alpha=0.6, label='sig2 prior')
    plt.title('sig2 conditional at MAP')
    plt.show()

    plt.plot(x3, T.cond.rv(tMAP).pdf(x3), 'r-', lw=5, alpha=0.6, label='sig2 prior')
    plt.title('sig2 conditional at MAP')
    plt.show()
Example #7
    def __init__(self,  MAX_DUR,  usePersistentProbs,  distributionType=1):
        '''
        '''    
        # 1 - normal. 
        # 2 - gamma TODO: implement
        # 3 - exponential
        self.distributionType = distributionType
        
        self.numDurs = int(2* deviationInSec * NUMFRAMESPERSEC)
        if not self.numDurs % 2:
            self.numDurs += 1
        

        
        '''
        maxDur x currDur lookupTable of probs
        '''
        self.R_MAX = MAX_DUR
        '''
        by how much (as a ratio) a phoneme may be longer than its score-assigned max_dur
        '''
        self.MAX_ALLOWED_DURATION_RATIO = 1
        
        if distributionType == 1:
            self.MAX_ALLOWED_DURATION_RATIO = 2
            
        self.lookupTableLogLiks  = numpy.empty((MAX_DUR, self.R_MAX + (self.numDurs-1) /2 + 1))
        self.lookupTableLogLiks.fill(-Infinity)
        
        if distributionType == 1:
            self.minVal = norm.ppf(0.01)
            self.maxVal= norm.ppf(0.99)
                    
        self._constructLogLiksTable(usePersistentProbs)
Example #8
  def adjust_thresholds(self, val_data, alpha, batchsize, nrepeats=10000, maxrepeats=1e6):

    self.thr_Z = normal_dist.ppf(alpha, loc=self.valmean, scale=self.valstd/np.sqrt(batchsize)), np.inf # no upper bound, only lower 
    self.thr_logZ = normal_dist.ppf(alpha, loc=self.vallogmean, scale=self.vallogstd/np.sqrt(batchsize)), np.inf # no upper bound, only lower 
    self.thr_symZ = normal_dist.ppf(0.5*alpha, loc=self.valmean, scale=self.valstd/np.sqrt(batchsize)), normal_dist.ppf(1.-0.5*alpha, loc=self.valmean, scale=self.valstd/np.sqrt(batchsize))
    self.thr_symlogZ = normal_dist.ppf(0.5*alpha, loc=self.vallogmean, scale=self.vallogstd/np.sqrt(batchsize)), normal_dist.ppf(1.-0.5*alpha, loc=self.vallogmean, scale=self.vallogstd/np.sqrt(batchsize))
    
    nrepeats = max( nrepeats, np.int(np.ceil(2./alpha)) )
    if nrepeats <= maxrepeats:
      mean_stat,logmean_stat = [],[]
      for i in range(nrepeats):
        batch = make_batch(val_data, batchsize)
        mean_stat.append( np.mean(batch) )
        logmean_stat.append( np.mean(np.log(batch) ) )
      
      mean_stat = np.sort(mean_stat)
      logmean_stat = np.sort(logmean_stat)
      index = np.int(np.floor(alpha*nrepeats)) # number of permitted outliers
      
      self.thr_mean = mean_stat[index], np.inf
      self.thr_logmean = logmean_stat[index], np.inf
      self.thr_symmean = mean_stat[(index-1)//2], mean_stat[-index//2]
      self.thr_symlogmean = logmean_stat[(index-1)//2], logmean_stat[-index//2]
    else: # disable tests
      self.thr_mean = -np.inf, np.inf
      self.thr_logmean = -np.inf, np.inf
      self.thr_symmean = -np.inf, np.inf
      self.thr_symlogmean = -np.inf, np.inf
Example #9
def d_prime(hits, false_alarms, n, nafc=1):
    """
    Calculate the sensitivity index d'.

    Parameters
    ----------
    hits : float
        The number of hits when detecting a signal.
    false_alarms : float
        The number of false alarms.
    n : int
        The number of trials in target and no-target trials.
    nafc : int, optional
        The number of alternative choices in the task. A value of ``1``
        implies a Yes/No task.
        Defaults to 1.

    Returns
    -------
    d : float
        The calculated d' value, z(hit_rate) - z(fa_rate).

    Example
    -------
    >>> from pphelper import sdt
    >>> sdt.d_prime(20, 10, 25)
    1.094968336708714

    """
    if nafc != 1:
        raise NotImplementedError('Only 1-AFC implemented so far.')

    hit_rate, fa_rate = _calculate_hit_and_fa_rates(hits, false_alarms, n)
    d = norm.ppf(hit_rate) - norm.ppf(fa_rate)
    return d
Example #10
def plot_quantiles(path = None):
    """
    Plot definition of quantile
    """
    vals = np.random.randn(1000,)
    fig = plt.figure(figsize=(10, 3))

    quantiles = [.05, .25, .5, .75, .95]

    ax = fig.add_subplot(121)
    ax.plot(vals, norm.cdf(vals), marker = "o", linestyle = "", markersize = 1.5)
    ax.set_xlim(-2.5, 2.5)
    ax.set_title("Cumulative distribution function")
    ax.set_xlabel("Values")
    ax.set_ylabel("Probability")
    for i,e in enumerate(quantiles):
        ax.axvline(x = norm.ppf(e), color = "red")

    ax = fig.add_subplot(122)
    ax.plot(vals, color = "black")
    ax.set_title("Vector samples from Gauusian distribution")
    ax.set_xlabel("Index")
    ax.set_ylabel("Values")
    for i,e in enumerate(quantiles):
        ax.axhline(y = norm.ppf(e), color = "red")
    if path:
        plt.savefig(path ,dpi = 300, bbox_inches='tight')
    else:
        plt.show()
def generate(estimator):
    from scipy.stats import norm

    n = 15  # Figure row size
    figure = np.zeros((28 * n, 28 * n))
    # Random normal distributions to feed network with
    x_axis = norm.ppf(np.linspace(0.05, 0.95, n))
    y_axis = norm.ppf(np.linspace(0.05, 0.95, n))

    samples = []
    for i, x in enumerate(x_axis):
        for j, y in enumerate(y_axis):
            samples.append(np.array([x, y], dtype=np.float32))

    samples = np.array(samples)
    x_reconstructed = estimator.generate(
        plx.processing.numpy_input_fn({'samples': samples}, batch_size=n * n, shuffle=False))

    results = [x['results'] for x in x_reconstructed]
    for i, x in enumerate(x_axis):
        for j, y in enumerate(y_axis):
            digit = results[i * n + j].reshape(28, 28)
            figure[i * 28: (i + 1) * 28, j * 28: (j + 1) * 28] = digit

    try:
        import matplotlib.pyplot as plt

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        plt.show()
    except ImportError:
        pass
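The nested loops that build the latent grid can also be written with meshgrid; a sketch of the same idea:

import numpy as np
from scipy.stats import norm

n = 15
grid = norm.ppf(np.linspace(0.05, 0.95, n))
xx, yy = np.meshgrid(grid, grid, indexing='ij')
samples = np.stack([xx.ravel(), yy.ravel()], axis=1).astype(np.float32)
print(samples.shape)   # (225, 2), same ordering as the loop above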
	def get_NGS(audio):

		"""
		Still need to test with Gauss vs Gauss
		Should be 0 I think. That means the Non-Gaussianity is low
		because it is actually a Guassian distribition


		This isnt correct - producing a NGS of 0.999something
		for almost all coughs. defs not right. need to test
		"""

		PLOT = 0

		# Number of samples in segment
		N = len(audio)

		# first get the inverse of the CDF for a normal distribution
		# N-point Gaussian distribution
		gauss_sorted = np.sort(np.random.randn(N))
		p_gauss = 1. * np.arange(N) / (N-1)
		g = norm.ppf(p_gauss)[1:-1]

		mu = np.mean(audio)
		sig = np.std(audio)

		data_sorted = np.sort(audio)

		# Get the CDF of the audio data
		p = 1. * np.arange(N) / (N-1)
		# Get the inverse of the CDF
		# throw away the first and last elements because
		# they are -inf and inf
		gamma = norm.ppf(p, loc=mu, scale=sig)[1:-1]

		# NGS = 1 - ratio of the sums of squared deviations
		NGS = 1.0 - (np.sum((g - np.mean(g)) ** 2) / np.sum((gamma - np.mean(gamma)) ** 2))

		if PLOT == 1:
			plt.figure(1)
			plt.subplot(2,1,1)
			plt.plot(gauss_sorted,p_gauss)
			plt.ylabel("$p$")
			plt.xlabel("$x$")
			plt.subplot(2,1,2)
			plt.plot(p_gauss[1:-1],g)
			plt.xlabel("$p$")
			plt.ylabel("")
			plt.show()

			plt.figure(2)
			plt.title("CDF and PPF with NGS = %f"%NGS)
			plt.subplot(2,1,1)
			plt.plot(data_sorted,p)
			plt.ylabel("$p$")
			plt.xlabel("$x$")
			plt.subplot(2,1,2)
			plt.plot(p[1:-1],gamma)
			plt.show()

		return NGS
def get_power(effect_size, N, p1, p2, significance, two_sided):
    # assumption 1: n1=n2
    # assumption 2: one-sided test

    p2 = p1 - effect_size

    # Our random var is the difference between event rate p1 and event rate p2.
    # So the variance of our random variable is Var[x] = Var[p1] + Var[p2]
    sigma = np.sqrt(p1*(1-p1) + p2*(1-p2))

    if two_sided:
        Z_crit = norm.ppf((1- (1-significance)/2 ))
    else:
        Z_crit = norm.ppf((1- (1-significance) ))


    # Note: our random var is the difference between control and test, so for every pair of
    # control/test observations, we have only one observation for our rand var. Thus, use n_control
    # or n_test but do not use n_total. Hence the N/2 sizes in the formulas below.
    if two_sided:
        power2 = 1 - norm.cdf(Z_crit - effect_size * np.sqrt(N/2)/sigma) + norm.cdf(-Z_crit - effect_size * np.sqrt(N/2)/sigma)
    else:
        power2 = 1 - norm.cdf(Z_crit - effect_size * np.sqrt(N/2)/sigma)

    return power2, Z_crit
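A hypothetical call with made-up numbers, just to show the inputs and outputs (p2 is recomputed inside the function):

power, z_crit = get_power(effect_size=0.05, N=2000, p1=0.30, p2=None,
                          significance=0.95, two_sided=True)
print(z_crit)   # norm.ppf(0.975) ~= 1.96
print(power)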
Example #14
 def rank(self, x, y):
     cnts = y.value_counts()
     scores = []
     def e(x, y):
         return -x / (x + y) * math.log(x / (x + y)) - y / (x + y) * math.log(y / (x + y))
     for c in x.columns:
         true_positives = float(np.count_nonzero(np.logical_and(x[c], y)))
         false_positives = float(np.count_nonzero(np.logical_and(x[c], np.logical_not(y))))
         pos = float(cnts[1])
         neg = float(cnts[0])
         n = pos + neg
         tpr = true_positives / pos
         fpr = false_positives / neg
         if self.type == 'bns':
             tpr = min(.9995, max(0.0005, tpr))
             fpr = min(.9995, max(0.0005, fpr))
             score = abs(norm.ppf(tpr) - norm.ppf(fpr))
         elif self.type == 'acc':
             score = abs(tpr - fpr)
         elif self.type == 'ig':
             score = e(pos, neg) - ( (true_positives + false_positives) / n * e(true_positives, false_positives)
                 + (1 - (true_positives + false_positives) / n) * e(pos - true_positives, neg - false_positives))
         scores.append((score, c))
     scores.sort(reverse=True)
     return scores
Example #15
def binormal_separation(matrix, index):
    """
    Calculates the binormal separation between a segment extension and each words
    it contains.

    Parameters
    ----------

    matrix: karl.Matrix
        The matrix in which the association is to be calculated

    index: list of booleans or list of integers
        Indexes segments which form the extension with which the association is
        to be measured.

    """

    a,b,c,d, unifs = _get_abcd(matrix, index)

    tpr = a / np.array(a + c, dtype=float)
    fpr = b / np.array(b + d, dtype=float)

    BNS = abs(norm.ppf(tpr)-norm.ppf(fpr))

    return sorted(zip(BNS, unifs), reverse=True)
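A quick standalone illustration of the binormal separation score itself (not the author's matrix pipeline): BNS grows as the true- and false-positive rates diverge.

from scipy.stats import norm

for tpr, fpr in [(0.9, 0.1), (0.7, 0.3), (0.5, 0.5)]:
    print(tpr, fpr, abs(norm.ppf(tpr) - norm.ppf(fpr)))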
def dualDigiPriceAnalytics(p1, p2):
    
    start = time()
    x1k = norm.ppf(1 - p1)
    x2k = norm.ppf(1 - p2)
    
    def jointDensity(y, x, rho):
        scale = 1 / (2 * sp.pi * sp.sqrt(1 - rho * rho))
        expo = sp.exp(-(y ** 2 + x ** 2 - 2 * rho * y * x) / (2 * (1 - rho ** 2)))
        return (scale * expo)
      
    calPrice = lambda rho : dblquad(jointDensity, x2k, sp.inf, lambda x : x1k, lambda x : sp.inf, args=(rho,))[0]

    rhoVec = np.linspace(-.99, .99, num=61)

    opsPrice = [calPrice(rho) for rho in rhoVec]
    
    end = time()

    print "time: ", end - start
    plt.plot(rhoVec, opsPrice)
    
    plt.title("Dual digital option price and correlation rho")
    plt.xlabel("Correlation (rho)")
    plt.ylabel("Option Price ($)")
    
    plt.show()
    

    return
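For a single rho, the double integral above is just the joint upper-tail probability, so it can be cross-checked via the bivariate normal CDF (a sketch; assumes a SciPy version that provides multivariate_normal.cdf):

import numpy as np
from scipy.stats import norm, multivariate_normal

p1, p2, rho = 0.3, 0.4, 0.5
x1k, x2k = norm.ppf(1 - p1), norm.ppf(1 - p2)
joint_cdf = multivariate_normal(mean=[0.0, 0.0],
                                cov=[[1.0, rho], [rho, 1.0]]).cdf([x1k, x2k])
# P(X1 > x1k, X2 > x2k) by inclusion-exclusion
price = 1 - norm.cdf(x1k) - norm.cdf(x2k) + joint_cdf
print(price)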
	def test (self):
		if self.n > 10:
			self._set_S()
			self._computer_var_S()

			if self.S > 0:
				z = (self.S - 1)/np.sqrt(self.var_S)
			elif self.S == 0:
				z = 0
			elif self.S < 0:
				z = (self.S + 1)/np.sqrt(self.var_S)
		    
		    # calculate the p_value
			p = 2*(1-norm.cdf(abs(z)))
			h = abs(z) > norm.ppf(1-self.alpha/2)
			if h:
				if z >= norm.ppf(1-self.alpha):
					m='+'
				elif z <= norm.ppf(1-self.alpha):
					m='-'
			else:
				m=None
			return h,m,p
		else:
			print 'Test can only be run on a series of more than 10'
			return None,None,None
Example #18
def sampleSize_twoGroups(d, alpha=0.05, beta=0.2, sigma1=1, sigma2=1):
    '''Sample size for two groups.'''
    
    n = round((norm.ppf(1-alpha/2.) + norm.ppf(1-beta))**2 * (sigma1**2 + sigma2**2) / d**2)
    
    print('In order to detect a change of {0} between groups with an SD of {1} and {2},'.format(d, sigma1, sigma2))
    print('with significance {0} and test-power {1}, you need in each group at least {2:d} subjects.'.format(alpha, 100*(1-beta), int(n)))
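A hypothetical call: detecting a difference of 0.5 between two groups with SD 1 each, at alpha = 0.05 and 80% power:

sampleSize_twoGroups(d=0.5, alpha=0.05, beta=0.2, sigma1=1, sigma2=1)
# n per group = round((1.960 + 0.842)**2 * (1 + 1) / 0.25) ~= 63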
Example #19
def inverse_local(local_prob, hyper):
	n_group = len(local_prob) // 3
	a = norm.ppf(local_prob[0:n_group], hyper[0], scatter)
	b = norm.ppf(local_prob[n_group:2*n_group], hyper[1], scatter)
	c = norm.ppf(local_prob[2*n_group:], hyper[2], scatter)
	local = np.hstack((a,b,c))
	return local
    def real_position_to_abstract(self, p0):
        """

        :param p0:
        :return:
        """
        return [norm.ppf((p0[0] + 1500) / 3000.), norm.ppf(p0[1] / 2000.)]
Example #21
def sampleSize_oneGroup(d, alpha=0.05, beta=0.2, sigma=1):
    '''Sample size for a single group.'''
    
    n = round((norm.ppf(1-alpha/2.) + norm.ppf(1-beta))**2 * sigma**2 / d**2)
    
    print('In order to detect a change of {0} in a group with an SD of {1},'.format(d, sigma))
    print('with significance {0} and test-power {1}, you need at least {2:d} subjects.'.format(alpha, 100*(1-beta), int(n)))
Example #22
def bcpcl(T,T_p,N_sigma):
    '''
    Calculates the bias corrected percent confidence limits.
    -- Suppose that we have observed data (y1, y2, ..., yn) and use it to estimate a population parameter Q (e.g. Q could be the true mean of the entire population).
    -- T is a statistic that estimates Q. For example T could be an estimate of the true mean by calculating the mean of  (y1, y2, ..., yn).
    -- Suppose that we create m bootstrap samples (y_p_1j, y_p_2j, ...,j_p_nj) from observed sample  (y1, y2, ..., yn), where j is the jth bootstrap sample.
    -- Then T_p_j is the jth bootstrap observation of T.  For example this could be the mean of (y_p_1j, y_p_2j, ...,j_p_nj).
    
    T = [float] e.g. biweight Location for (y1, y2, ..., yn)
    T_p = [vector array] biweight Locations for the bootstrap samples
    N_sigma = the number of sigma to report the confidence limits for
        e.g. for 95% confidence limits N_sigma=2
    Return (lower, upper) confidence limits
    '''
    #Percentile confidence interval is defined as 100%(1-a), thus for 1sigma a=0.32
    a = 1-erf(N_sigma/numpy.sqrt(2))
    #order the bootstrap sample values smallest to largest
    index = numpy.argsort(T_p)
    T_p = T_p[index]
    #Number of bootstrap samples
    m = numpy.size(T_p)        
    #Calculate the bias correction term
    mask = T_p < T
    z_0 = norm.ppf(numpy.sum(mask)/m)
    #Calculate the a1 and a2 values
    a1 = norm.cdf(2*z_0+norm.ppf(a/2))
    a2 = norm.cdf(2*z_0+norm.ppf(1-a/2))
    #Calculate the lower and upper indices of the confidence interval
    id_L = int(m*a1) - 1
    id_U = int(m*a2)
    #Find the lower and upper confidence values
    T_L = T_p[id_L]
    T_U = T_p[id_U]
    return T_L, T_U
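A usage sketch on a toy bootstrap distribution (it assumes numpy, scipy.stats.norm and scipy.special.erf are imported as in the snippet):

rng = numpy.random.default_rng(1)
sample = rng.normal(10.0, 2.0, size=200)
T = numpy.mean(sample)                                   # statistic on the observed data
T_p = numpy.array([numpy.mean(rng.choice(sample, sample.size)) for _ in range(2000)])
lower, upper = bcpcl(T, T_p, N_sigma=2)                  # ~95% bias-corrected limits
print(lower, upper)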
Example #23
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('model', help='model file to load')
    parser.add_argument('dset', choices=['mnist'])
    args = parser.parse_args()

    with open(args.model, 'rb') as f:
        model = pickle.load(f)
    if args.dset == 'mnist':
        S = (28, 28)
        M = 20

    manifold = np.zeros((S[0]*M, S[1]*M), dtype=theano.config.floatX)

    for z1 in xrange(M):
        for z2 in xrange(M):
            print z1, z2
            z = np.zeros((1, 2))
            # pass unit square through inverse Gaussian CDF
            z[0, 0] = norm.ppf(z1 * 1.0/M + 1.0/(M * 2))
            z[0, 1] = norm.ppf(z2 * 1.0/M + 1.0/(M * 2))
            z = np.array(z, dtype=theano.config.floatX)
            x_hat = model.decode(z)
            x_hat = x_hat.reshape(S)
            manifold[z1 * S[0]:(z1 + 1) * S[0],
                     z2 * S[1]:(z2 + 1) * S[1]] = x_hat

    plt.imshow(manifold, cmap='Greys_r')
    plt.axis('off')
    plt.show()
Example #24
def z_effect(counts, power, alpha=0.05):
    """Estimates the effect size for power based on the z distribution

    This is based on the equations in
        Liu, X.S. (2014) *Statistical power analysis for the social and
        behavioral sciences: basic and advanced techniques.* New York:
        Routledge. 378 pg.
    The equation assumes a positive magnitude to the effect size and a
    two-tailed test.

    Parameters
    ----------
    counts : array
        The number of observations for each power depth
    power : array
        The statistical power at the depth specified by `counts`
    alpha : float, optional
        The critical alpha value (significance level) used to calculate the power

    Returns
    -------
    ndarray
        A standard measure of the difference between the underlying
        populations

    """
    power = np.atleast_2d(power)
    z_diff = z.ppf(power) + z.ppf(1 - alpha/2)
    eff = np.sqrt(np.square(z_diff) / counts)

    eff[power == 1] = np.nan
    eff[np.isinf(eff)] = np.nan

    return eff
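A quick numeric check of the formula, assuming `z` is scipy.stats.norm as in the snippet: for n = 50 and power = 0.8 at alpha = 0.05,

import numpy as np
from scipy.stats import norm as z

print(np.sqrt(np.square(z.ppf(0.8) + z.ppf(1 - 0.05 / 2)) / 50))   # ~0.40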
Example #25
def cmc(g, xdists, u_to_x, T, seed, maxitr):
    """
    Crude Monte Carlo simulation.
    """

    # Seed the random number generator if required
    if seed == -1:
        prng = RandomState()
    else:
        prng = RandomState(seed)
    
    # Generate standard normal samples centered at the origin
    u0 = zeros(len(xdists))
    covmat = eye(len(xdists)) 
    u = prng.multivariate_normal(u0, covmat, size=maxitr).T
    g_mc = g(u_to_x(u, xdists, T))

    # Convert g-function output to pass/fail indicator function and estimate pf
    g_mc[g_mc>0] = 0
    g_mc[g_mc<0] = 1
    mu_pf = g_mc.mean()
    beta = -norm.ppf(mu_pf) if mu_pf < 0.5 else norm.ppf(mu_pf)

    # Convergence metrics (standard deviation, standard error, CoV of s.e.)
    std_pf = g_mc.std(ddof=1) # Calculate sample standard deviation
    se_pf = std_pf/sqrt(maxitr)
    cv_pf = se_pf/mu_pf

    return {'vars': xdists, 'beta': beta, 'Pf': mu_pf, 'stderr': se_pf, 
            'stdcv': cv_pf}
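The reliability index computed above is just the negated normal quantile of the failure probability (for pf < 0.5); for example:

from scipy.stats import norm

pf = 0.001
print(-norm.ppf(pf))   # beta ~ 3.09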
Example #26
def mediation(var1,var2,var3, alpha=0.05, n_samples=10000, type='pearson', epsilon=0.001):
	if type.lower() == "pearson": statfunction = mediation_pearson
	if type.lower() == "kendalltau": statfunction = mediation_kendalltau
	if type.lower() == "spearman": statfunction = mediation_spearman
	# Deal with the alpha values
	if np.iterable(alpha):
		alphas = np.array(alpha)
	else:
		alphas = np.array([alpha/2,1-alpha/2])
	data = (var1,var2,var3)
	# Ensure that the data is actually an array. This isn't nice to pandas,
	#data = tuple( np.array(x) for x in data )

	# We don't need to generate actual samples; that would take more memory.
	# Instead, we can generate just the indexes, and then apply the statfun
	# to those indexes.
	bootindexes = bootstrap_indexes(len(var1), n_samples )
	stat = np.array([statfunction(*(x[indexes] for x in data)) for indexes in bootindexes])
	stat.sort(axis=0)

	# Bias-Corrected Accelerated Method


	# The value of the statistic function applied just to the actual data.
	ostat = statfunction(*data)

	# The bias correction value.
	z0 = norm.ppf( ( 1.0*np.sum(stat < ostat, axis=0)  ) / n_samples )

	# Statistics of the jackknife distribution
	jackindexes = jackknife_indexes(data[0])
	jstat = [statfunction(*(x[indexes] for x in data)) for indexes in jackindexes]
	jmean = np.mean(jstat,axis=0)

	# Acceleration value
	a = np.sum( (jmean - jstat)**3, axis=0 ) / ( 6.0 * np.sum( (jmean - jstat)**2, axis=0)**1.5 )

	zs = z0 + norm.ppf(alphas).reshape(alphas.shape+(1,)*z0.ndim)

	avals = norm.cdf(z0 + zs/(1-a*zs))


	nvals = np.round((n_samples-1)*avals)
	
	if np.any(nvals==0) or np.any(nvals==n_samples-1):
		warnings.warn("Some values used extremal samples; results are probably unstable.", InstabilityWarning)
	elif np.any(nvals<10) or np.any(nvals>=n_samples-10):
		warnings.warn("Some values used top 10 low/high samples; results may be unstable.", InstabilityWarning)

	nvals = np.nan_to_num(nvals).astype('int')


	if nvals.ndim == 1:
		# All nvals are the same. Simple broadcasting
		return {"Estimate":statfunction(*data),"%.2f%% Confidence Interval" % round(1- alpha,2): stat[nvals]}
	else:
		# Nvals are different for each data point. Not simple broadcasting.
		# Each set of nvals along axis 0 corresponds to the data at the same
		# point in other axes.
		return {"Estimate":statfunction(*data),"%.2f%% Confidence Interval" % round(1- alpha,2): stat[(nvals, np.indices(nvals.shape)[1:].squeeze())]}
Example #27
def est_sdt(f, h, m, r, rule='yn'):
    """Calculate maximum-likelihood estimates of sensitivity and bias.

    Args:
        f: False alarms.
        h: Hits.
        m: Misses.
        r: Correct rejections.
        rule: Name of decision rule.

    Returns:
        [(d1, c1) ...]

    """
    out = []
    for _f, _h, _m, _r in zip(f, h, m, r):
        n0, n1 = float(_f + _r), float(_h + _m)
        if _f == 0:
            _f += 0.5
        if _f == n0:
            _f -= 0.5
        if _h == 0:
            _h += 0.5
        if _h == n1:
            _h -= 0.5
        fhat = _f / float(n0)
        hhat = _h / float(n1)
        d = norm.ppf(hhat) - norm.ppf(fhat)
        c = -0.5 * (norm.ppf(hhat) + norm.ppf(fhat))
        if rule == '2afc':
            d /= np.sqrt(2)
            c /= np.sqrt(2)
        out.append((d, c))
    return out
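A hypothetical call with raw counts from a single condition; the 0.5 corrections above only kick in when a rate would be 0 or 1:

print(est_sdt(f=[5], h=[20], m=[5], r=[20], rule='yn'))   # d ~ 1.68, c ~ 0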
Example #28
def q2qnbinom(counts, input_mean, output_mean, dispersion):
    """ Quantile to Quantile for a negative binomial
    """
    zero = logical_or(input_mean < 1e-14, output_mean < 1e-14)
    input_mean[zero] = input_mean[zero] + 0.25
    output_mean[zero] = output_mean[zero] + 0.25
    ri = 1 + multiply(np.matrix(dispersion).T, input_mean)
    vi = multiply(input_mean, ri)
    rO = 1 + multiply(np.matrix(dispersion).T, output_mean)
    vO = multiply(output_mean, rO)
    i = counts >= input_mean
    low = logical_not(i)
    p1 = empty(counts.shape, dtype=np.float64)
    p2 = p1.copy()
    q1, q2 = p1.copy(), p1.copy()
    if i.any():
        p1[i] = norm.logsf(counts[i], loc=input_mean[i], scale=np.sqrt(vi[i]))[0, :]
        p2[i] = gamma.logsf(counts[i], (input_mean / ri)[i], scale=ri[i])[0, :]
        q1[i] = norm.ppf(1 - np.exp(p1[i]), output_mean[i], np.sqrt(vO[i]))[0, :]
        q2[i] = gamma.ppf(1 - np.exp(p2[i]), np.divide(output_mean[i], rO[i]), scale=rO[i])[0, :]

    if low.any():
        p1[low] = norm.logcdf(counts[low], loc=input_mean[low], scale=np.sqrt(vi[low]))[0, :]
        p2[low] = gamma.logcdf(counts[low], input_mean[low] / ri[low], scale=ri[low])[0, :]
        q1[low] = norm.ppf(np.exp(p1[low]), loc=output_mean[low], scale=np.sqrt(vO[low]))[0, :]
        q2[low] = gamma.ppf(np.exp(p2[low]), output_mean[low] / rO[low], scale=rO[low])[0, :]
    return (q1 + q2) / 2
def t_to_z(mr, dof):
  
  data = mr.get_data()

  # Select just the nonzero voxels
  nonzero = data[data!=0]

  # We will store our results here
  Z = np.zeros(len(nonzero))

  # Select values less than or == 0, and greater than zero
  c  = np.zeros(len(nonzero))
  k1 = (nonzero <= c)
  k2 = (nonzero > c)

  # Subset the data into two sets
  t1 = nonzero[k1]
  t2 = nonzero[k2]

  # Calculate p values for <=0
  p_values_t1 = t.cdf(t1, df = dof)
  z_values_t1 = norm.ppf(p_values_t1)

  # Calculate p values for > 0
  p_values_t2 = t.cdf(-t2, df = dof)
  z_values_t2 = -norm.ppf(p_values_t2)
  Z[k1] = z_values_t1
  Z[k2] = z_values_t2

  # Create new nifti
  empty_nii = np.zeros(mr.shape)
  empty_nii[mr.get_data()!=0] = Z
  Z_nii_fixed = nib.nifti1.Nifti1Image(empty_nii,affine=mr.get_affine(),header=mr.get_header())
  return Z_nii_fixed
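A scalar sketch of the same t-to-z conversion (the positive-t branch), assuming scipy.stats is available:

from scipy.stats import norm, t as t_dist

tval, dof = 3.2, 20
z_equiv = -norm.ppf(t_dist.cdf(-tval, df=dof))   # mirrors the k2 branch above
print(z_equiv)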
def simulate():
    line = np.linspace(-100, 100, 201)
    X = norm.pdf(line, loc=5, scale=18)

    # Plotting the PDF and CDF of N(5,18) over the range of (-100, 100)
    pyplot.subplot(211)
    pyplot.plot(line, X)
    pyplot.title('PDF')
    CDF = np.cumsum(X)
    pyplot.subplot(212)
    pyplot.title('CDF')
    pyplot.plot(line, CDF)
    pyplot.show()

    # 1. P(X<8)
    print('P(X<8): ', norm.cdf(8, loc=5, scale=18))

    # 2. P(X>-2)
    print('P(X>-2): ', 1 - norm.cdf(-2, loc=5, scale=18))

    # 3. x such that P(X>x) = 0.05
    print('x such that P(X>x) = 0.05: ', norm.ppf(0.95, loc=5, scale=18))

    # 4. P(0<=X<4)
    print('P(0<=X<4): ', norm.cdf(4, loc=5, scale=18) - norm.cdf(0, loc=5, scale=18))

    # 5. x such that P(abs(X) > abs(x)) = 0.05
    print('x such that P(abs(X) > abs(x)) = 0.05: ', norm.ppf(0.975, loc=5, scale=18))
Example #31
import numpy as np
from scipy.stats import norm
from scipy import special

# Standard (Normal) Gaussian
mu, sigma = 0, 1


def gaussian(x, mu, sigma):
    normal = (1.0 / np.sqrt(2 * np.pi * sigma**2))
    distribution = np.exp(-(x - mu)**2 / (2 * sigma**2))
    return normal * distribution


# generate a range of x values spanning twice the 1% and 99% quantiles (about -4.65 to 4.65)
x = np.linspace(norm.ppf(0.01) * 2, norm.ppf(0.99) * 2, 100)
# generate pdf (standard normal distribution)
pdf = gaussian(x, mu, sigma)

# Central Limit Theorem
# Generate Samples from a Normal Distribution
mu_original, sigma_original = 5, 10
sampleSizes = [100, 1000, 10000]
fig = 1
for sample in sampleSizes:
    trails = np.arange(1, sample)
    # need to generate CLT statistic
    statistic = []
    for n in trails:
        # s can be generated from any distribution
        s = (np.sum(np.random.normal(mu_original, sigma_original, n)) - n*mu_original)  / \
            (sigma_original * np.sqrt(n))
        statistic.append(s)
def hall_sheather(n, q, alpha=.05):
    z = norm.ppf(q)
    num = 1.5 * norm.pdf(z)**2.
    den = 2. * z**2. + 1.
    h = n**(-1. / 3) * norm.ppf(1. - alpha / 2.)**(2./3) * (num / den)**(1./3)
    return h
def bofinger(n, q):
    num = 9. / 2 * norm.pdf(2 * norm.ppf(q))**4
    den = (2 * norm.ppf(q)**2 + 1)**2
    h = n**(-1. / 5) * (num / den)**(1. / 5)
    return h
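A quick comparison of the two quantile-regression bandwidth rules above (just illustrative):

for n in (100, 1000):
    print(n, hall_sheather(n, q=0.5), bofinger(n, q=0.5))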
    tempReturns = np.log(
        tempPrices.iloc[1:, 1:4].astype(float).dropna() /
        tempPrices.iloc[1:, 1:4].shift(1).astype(float).dropna())
    tempReturnsLast = tempReturns.iloc[-1, :]
    cov_matrix = tempReturns.cov()
    #    tempReturns[tempReturns['Brent].cov()
    mean_returns = tempReturns.mean()
    tempRev = revenue[revenue['Date'] == date].reset_index(drop=True)
    weights = np.array(
        [tempRev['wtiWeight'], tempRev['brentWeight'], tempRev['gasWeight']])
    port_mean = mean_returns.dot(weights).item() * 252
    portMean.append(port_mean)
    port_stdev = np.sqrt(weights.T.dot(cov_matrix).dot(weights)).item()
    portStd.append(port_stdev)
    #one_day_var = norm.ppf(conf_level, port_mean, port_stdev)
    oneDayvar.append(norm.ppf(conf_level, port_mean, port_stdev))
    annualVar.append(
        norm.ppf(conf_level, port_mean, port_stdev) * np.sqrt(252))
    portReturn.append(tempReturnsLast.dot(weights).item())

    #Component VaR
    #tempReturns.dot(weights).cumprod().plot()
    #.Portfolio.plot()

df = pd.DataFrame({
    'dateId': dateId,
    'portMean': portMean,
    'portStd': portStd,
    'oneDayvar': oneDayvar,
    'annualVar': annualVar,
    'portReturn': portReturn
Example #35
import matplotlib.pyplot as plt

# Constants
EQUITY_INDEX_CUTOFFS = [0, 0.03, 0.07, 0.1, 0.15, 0.3, 1]
# Parameters
n = 1000  #names in credit index
rho = 0.1
num_sims = 1000
prob_default = 0.25
# For equity tranche 0-20%, mezzanine 20%-80%:
# tranche_cutoffs = [0, 0.2, 1 ]
tranche_cutoffs = EQUITY_INDEX_CUTOFFS
tranche_to_watch = 4  #1 is equity, 2 mezz, etc

# derived parameters
z_score_of_default = norm.ppf(prob_default)
beta = rho**0.5
alpha = (1 - rho)**0.5
max_defaults_protected = int(round(n * tranche_cutoffs[tranche_to_watch - 1]))
wiped_out_defaults = int(round(n * tranche_cutoffs[tranche_to_watch]))
names_in_tranche = wiped_out_defaults - max_defaults_protected

# run simulation
trial_results = []
names_remaining_in_tranche = []
for _ in range(num_sims):
    M = random.gauss(0, 1)
    K = 0  # number of names defaulting
    for _ in range(n):
        R_i = beta * M + alpha * random.gauss(0, 1)
        if R_i < z_score_of_default:
Example #36
def approx_exp_max_sharpe(mean_sharpe, var_sharpe, nb_trials):
    """Expected Maximum Sharpe Ratio."""
    return mean_sharpe + np.sqrt(var_sharpe) * \
        ((1 - np.euler_gamma) * norm.ppf(1 - 1 / nb_trials) + np.euler_gamma * norm.ppf(1 - 1 / (nb_trials * np.e)))
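With hypothetical numbers (cross-sectional mean Sharpe 0, variance 1, 100 independent trials), the expected maximum Sharpe found by chance comes out around 2.5:

print(approx_exp_max_sharpe(mean_sharpe=0.0, var_sharpe=1.0, nb_trials=100))   # ~2.5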
Example #37
    def estimate_ate(self, X, p, treatment, y, segment=None, return_ci=False):
        """Estimate the Average Treatment Effect (ATE).

        Args:
            X (np.matrix or np.array or pd.Dataframe): a feature matrix
            p (np.ndarray or pd.Series or dict): an array of propensity scores of float (0,1) in the single-treatment
                case; or, a dictionary of treatment groups that map to propensity vectors of float (0,1)
            treatment (np.array or pd.Series): a treatment vector
            y (np.array or pd.Series): an outcome vector
            segment (np.array, optional): An optional segment vector of int. If given, the ATE and its CI will be
                                          estimated for each segment.
            return_ci (bool, optional): Whether to return confidence intervals

        Returns:
            (tuple): The ATE and its confidence interval (LB, UB) for each treatment, t and segment, s
        """
        check_treatment_vector(treatment, self.control_name)
        X, treatment, y = convert_pd_to_np(X, treatment, y)
        self.t_groups = np.unique(treatment[treatment != self.control_name])
        self.t_groups.sort()

        check_p_conditions(p, self.t_groups)
        if isinstance(p, np.ndarray):
            treatment_name = self.t_groups[0]
            p = {treatment_name: convert_pd_to_np(p)}
        elif isinstance(p, dict):
            p = {
                treatment_name: convert_pd_to_np(_p)
                for treatment_name, _p in p.items()
            }

        ate = []
        ate_lb = []
        ate_ub = []

        for i, group in enumerate(self.t_groups):
            logger.info('Estimating ATE for group {}.'.format(group))
            w_group = (treatment == group).astype(int)
            p_group = p[group]

            if self.calibrate_propensity:
                logger.info('Calibrating propensity scores.')
                p_group = calibrate(p_group, w_group)

            yhat_c = np.zeros_like(y, dtype=float)
            yhat_t = np.zeros_like(y, dtype=float)
            if self.cv:
                for i_fold, (i_trn,
                             i_val) in enumerate(self.cv.split(X, y), 1):
                    logger.info(
                        'Training an outcome model for CV #{}'.format(i_fold))
                    self.model_tau.fit(
                        np.hstack((X[i_trn], w_group[i_trn].reshape(-1, 1))),
                        y[i_trn])

                    yhat_c[i_val] = self.model_tau.predict(
                        np.hstack((X[i_val], np.zeros((len(i_val), 1)))))
                    yhat_t[i_val] = self.model_tau.predict(
                        np.hstack((X[i_val], np.ones((len(i_val), 1)))))

            else:
                self.model_tau.fit(np.hstack((X, w_group.reshape(-1, 1))), y)

                yhat_c = self.model_tau.predict(
                    np.hstack((X, np.zeros((len(y), 1)))))
                yhat_t = self.model_tau.predict(
                    np.hstack((X, np.ones((len(y), 1)))))

            if segment is None:
                logger.info('Training the TMLE learner.')
                _ate, se = simple_tmle(y, w_group, yhat_c, yhat_t, p_group)
                _ate_lb = _ate - se * norm.ppf(1 - self.ate_alpha / 2)
                _ate_ub = _ate + se * norm.ppf(1 - self.ate_alpha / 2)
            else:
                assert segment.shape[0] == X.shape[
                    0] and segment.ndim == 1, 'Segment must be the 1-d np.array of int.'
                segments = np.unique(segment)

                _ate = []
                _ate_lb = []
                _ate_ub = []
                for s in sorted(segments):
                    logger.info(
                        'Training the TMLE learner for segment {}.'.format(s))
                    filt = (segment
                            == s) & (yhat_c < np.quantile(yhat_c, q=.99))
                    _ate_s, se = simple_tmle(y[filt], w_group[filt],
                                             yhat_c[filt], yhat_t[filt],
                                             p_group[filt])
                    _ate_lb_s = _ate_s - se * norm.ppf(1 - self.ate_alpha / 2)
                    _ate_ub_s = _ate_s + se * norm.ppf(1 - self.ate_alpha / 2)

                    _ate.append(_ate_s)
                    _ate_lb.append(_ate_lb_s)
                    _ate_ub.append(_ate_ub_s)

            ate.append(_ate)
            ate_lb.append(_ate_lb)
            ate_ub.append(_ate_ub)

        return np.array(ate), np.array(ate_lb), np.array(ate_ub)
def mk_test(x, time, confidence_interval=False, alpha=0.05):
    """
    This function is derived from code originally posted by Sat Kumar Tomer
    ([email protected])
    See also: http://vsp.pnnl.gov/help/Vsample/Design_Trend_Mann_Kendall.htm
    The purpose of the Mann-Kendall (MK) test (Mann 1945, Kendall 1975, Gilbert
    1987) is to statistically assess if there is a monotonic upward or downward
    trend of the variable of interest over time. A monotonic upward (downward)
    trend means that the variable consistently increases (decreases) through
    time, but the trend may or may not be linear. The MK test can be used in
    place of a parametric linear regression analysis, which can be used to test
    if the slope of the estimated linear regression line is different from
    zero. The regression analysis requires that the residuals from the fitted
    regression line be normally distributed; an assumption not required by the
    MK test, that is, the MK test is a non-parametric (distribution-free) test.
    Hirsch, Slack and Smith (1982, page 107) indicate that the MK test is best
    viewed as an exploratory analysis and is most appropriately used to
    identify stations where changes are significant or of large magnitude and
    to quantify these findings.
    Input:
        x:     a vector of data
        time:  a vector of time points corresponding to x
        confidence_interval: if True, also bootstrap a confidence estimate (False default)
        alpha: significance level (0.05 default)
    Output:
        trend: tells the trend (increasing, decreasing or no trend)
        h: True (if trend is present) or False (if trend is absent)
        p: p value of the significance test
        z: normalized test statistics
        slope: Sen's slope estimate (median of the pairwise slopes)
        std_conf: confidence information from the bootstrap (0 if not requested)
    Examples
    --------
      >>> x = np.random.rand(100)
      >>> t = np.arange(100)
      >>> trend, h, p, z, slope, std_conf = mk_test(x, t, alpha=0.05)
    """
    n = len(x)

    # calculate S
    s = 0
    N = int(n * (n - 1) / 2)
    q = np.zeros(N)
    ii = 0
    for k in range(n - 1):
        for j in range(k + 1, n):
            s += np.sign(x[j] - x[k])
            q[ii] = (x[j] - x[k]) / (time[j] - time[k])
            ii = ii + 1

    # calculate the unique data
    unique_x = np.unique(x)
    g = len(unique_x)

    sort_q = np.sort(q)
    slope = np.median(sort_q)

    # calculate the var(s)
    if n == g:  # there is no tie
        var_s = (n * (n - 1) * (2 * n + 5)) / 18
    else:  # there are some ties in data
        tp = np.zeros(unique_x.shape)
        for i in range(len(unique_x)):
            tp[i] = sum(x == unique_x[i])
        var_s = (n * (n - 1) * (2 * n + 5) - np.sum(tp * (tp - 1) *
                                                    (2 * tp + 5))) / 18

    if s > 0:
        z = (s - 1) / np.sqrt(var_s)
    elif s < 0:
        z = (s + 1) / np.sqrt(var_s)
    else:  # s == 0:
        z = 0

    # calculate the p_value
    p = 2 * (1 - norm.cdf(abs(z)))  # two tail test
    h = abs(z) > norm.ppf(1 - alpha / 2)

    if (s < 0) and h:
        trend = 'decreasing'
    elif (s > 0) and h:
        trend = 'increasing'
    else:
        trend = 'no trend'


#    # confidence interva
#    stats = np.random.choice(sort_q, (len(sort_q), 100), replace=True)
#    p1 = ((1.0 - alpha)/2.0) * 100
#    lower = max(0.0, np.percentile(stats, p1))
#    p2 = (alpha+((1.0-alpha)/2.0)) * 100
#    upper = min(1.0, np.percentile(stats, p2))
#    print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

    std_conf = 0
    if (confidence_interval == True):
        bias, std, rms, bias_conf, std_conf, rms_conf = bootstr_confidence(
            sort_q, 100)

    return trend, h, p, z, slope, std_conf
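A hypothetical usage: a noisy upward trend should come back as 'increasing' with a Sen's slope near 0.05.

import numpy as np

t = np.arange(100)
x = 0.05 * t + np.random.randn(100)
trend, h, p, z, slope, std_conf = mk_test(x, t, confidence_interval=False, alpha=0.05)
print(trend, round(slope, 3))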
Example #39
    def fit_metric_model(self):
        logger.info("start computing metric model...")

        ### Load the results
        df_results = self.result_reader.load_all_results(aggregate=True)

        self._nb_models_done = len(df_results)
        if self._nb_models_done <= self.min_nb_of_models:
            return self

        if (self._nb_models_done is not None
                and len(df_results) == self._nb_models_done
                and self.params_training_columns is not None):
            return self

        ### Load the params
        df_params = self.result_reader.load_all_params()

        df_merged_result = pd.merge(df_params,
                                    df_results,
                                    how="inner",
                                    on="job_id")

        training_cols = diff(list(df_params.columns), ["job_id"])

        # X dataframe for parameters
        dfX_params = df_merged_result.loc[:, training_cols]

        ### Retrieve the target metric

        if self.avg_metrics:
            scorers = self.job_config.scoring
        else:
            scorers = [self.job_config.main_scorer
                       ]  # I'll use only the main_scorer

        N = dfX_params.shape[0]
        all_y_params = []
        for scorer in scorers:
            y_params = df_merged_result["test_%s" %
                                        scorer]  # Retrieve the raw metric
            # replace NaN by scorer's observed minimum score ; if y_params contains
            # only NaN -> won't work
            y_params = y_params.fillna(y_params.min()).values

            if self.metric_transformation is None:
                pass

            elif self.metric_transformation == "rank":
                ### Transform in non-parametric rank ....
                y_params = kde_transfo_quantile(y_params)

                # => This behaves like a uniform law

            elif self.metric_transformation == "normal":
                ### Transform into non-parametric normal ...
                y_params = norm.ppf(kde_transfo_quantile(y_params))

                # => This behaves like a normal law

            elif self.metric_transformation == "default":
                ### Transform using default transformation (log like function)
                f = get_metric_default_transformation(scorer)
                y_params = f(y_params)

                if self.avg_metrics:
                    # If I'm averaging I'd rather have something centered
                    y_params = (y_params -
                                np.mean(y_params)) / np.std(y_params)

            else:
                raise ValueError("I don't know this metric_transformation %s" %
                                 self.metric_transformation)

            all_y_params.append(y_params.reshape((N, 1)))

        if len(all_y_params) > 1:
            y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
        else:
            y_params = all_y_params[0].reshape((N, ))

        #        elif self.metric_transformation
        #
        #
        #        else:
        #            # We could also use the default transformation?
        #            scorer = self.job_config.main_scorer
        #            y_params = df_merged_result["test_%s" % scorer].values
        #

        # create model
        transformer_model = GraphPipeline(models={
            "encoder": NumericalEncoder(),
            "imputer": NumImputer()
        },
                                          edges=[("encoder", "imputer")])

        xx_params = transformer_model.fit_transform(dfX_params)

        random_forest = RandomForestRegressor(n_estimators=100,
                                              min_samples_leaf=5)

        random_forest.fit(xx_params, y_params)

        random_forest_variance = RandomForestVariance(random_forest)
        random_forest_variance.fit(xx_params, y_params)

        self.params_training_columns = training_cols
        self.transformer_model = transformer_model
        self.random_forest = random_forest
        self.random_forest_variance = random_forest_variance

        self._nb_models_done = len(df_results)

        logger.info("metric model fitted")

        return self
# X ~ N(mean, variance)
#############################
from scipy.stats import norm

mean = loc = 3            # loc
std_deviation = scale = 2    # scale (standard deviation)
x = 2.5

pdf_value = norm.pdf(x, loc, scale)
print(f"When X ~ N({loc}, {scale}^2),\t pdf(X = {x}) = {pdf_value}")
cdf_value = norm.cdf(x, loc, scale)
print(f"When X ~ N({loc}, {scale}^2),\t cdf(X <= {x}) = {cdf_value}")

# ppf: percentage point function (inverse function of cdf)
p = 0.25
ppf_value = norm.ppf(p, loc, scale)
print(f"When X ~ N({loc}, {scale}^2),\t ppf(p = {p}) = {ppf_value}")
print(f"When X ~ N({loc}, {scale}^2),\t IQR = [{norm.ppf(0.25, loc, scale)}, {norm.ppf(0.75, loc, scale)}]")

# rvs : random variates
sample_size = 10
print(f"Random Variates (size :{sample_size}) from X ~ N({loc}, {scale}^2)\n", norm.rvs(loc,scale, size=sample_size))
print()
#%%
#############################
# Gamma Distribution
# X ~ Gamma(k, theta)   but in scipy, theta = 1
# f(x;k, theta) = x**(k-1) * exp(-x / theta) / theta ** k / gamma_function(k)
#############################
from scipy.stats import gamma
k = 1
def absPortf_HistVaR(listofInv=dfTime,
                     numDays=2,
                     MoneyVol=1000,
                     startDt='22/11/2016',
                     endDt='01/12/2016'):
    """
    Variance-Covariance calculation of daily Value-at-Risk
    using confidence level c, with mean of returns mu
    and standard deviation of returns sigma, on a portfolio
    of value P.
    """
    ##startDate = pd.to_datetime('22/11/2016' )
    startDate = np.datetime64(datetime.datetime.strptime(startDt, '%d/%m/%Y'))
    endDate = np.datetime64(datetime.datetime.strptime(endDt, '%d/%m/%Y'))

    ##remove Timestamp, Trade Open, Trade Close
    listofInv = pd.DataFrame(listofInv.ix[1:, 0:])

    listofInv['Date'] = pd.to_datetime(listofInv['Date'])
    listofInv = pd.DataFrame(listofInv)
    listofInv[(listofInv.Date >= startDate) & (listofInv.Date <= endDate)]

    set1 = pd.DataFrame()

    colListPrime = list(dfTime.columns.values)
    del colListPrime[0]
    colListPrime = colListPrime[::2]

    for i in range(1, len(colListFull), 2):
        set2 = (listofInv[listofInv.columns[i + 1]].astype(float)) - (
            listofInv[listofInv.columns[i]].astype(float))
        set1 = pd.concat([set1, set2], axis=1)
        del set2

    set1.columns = [colListPrime]
    #print(set1.head())

    SumSet = set1.groupby(
        (np.arange(len(set1.columns)) // len(set1.columns) * 10) + 1,
        axis=1).sum().add_prefix('sum')
    SumSet["rets"] = SumSet["sum1"].pct_change()

    SumSet.replace(np.NaN, 0, inplace=True)
    SumSet = SumSet.replace([np.inf, -np.inf], 0)
    SumSet = SumSet.astype(float)

    mu = np.mean(SumSet["rets"])
    sigma = np.std(SumSet["rets"])

    print(mu)
    print(sigma)
    #print "Value-at-Risk: $%0.2f" % var

    valueAtRisk_95 = MoneyVol - MoneyVol * (norm.ppf(0.05, mu, sigma) + 1)
    valueAtRisk_99 = MoneyVol - MoneyVol * (norm.ppf(0.01, mu, sigma) + 1)

    #Portf_stddev = (sum1+2*sum2) ** (0.5)
    print("Portfolio Historical Value at Risk with 95% confidence is: " +
          str(valueAtRisk_95))
    print("Portfolio Historical Value at Risk with 99% confidence is: " +
          str(valueAtRisk_99))
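The VaR lines above simplify to -P * ppf(alpha, mu, sigma); a standalone check with hypothetical numbers:

from scipy.stats import norm

P, mu, sigma = 1000, 0.0005, 0.02
print(P - P * (norm.ppf(0.05, mu, sigma) + 1))   # == -P * norm.ppf(0.05, mu, sigma) ~= 32.4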
 def calculate_z_score(self):
     return norm.ppf(random.random())
Example #43
x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
plt.figure(figsize=(6, 6))
plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test_)
plt.colorbar()
# plt.show()
plt.savefig('x_test_encoded.jpg')

# build the generator
decoder_input = Input(shape=(latent_dim, ))
_h_decoded = decoder_h(decoder_input)
_x_decoded_mean = decoder_mean(_h_decoded)
generator = Model(decoder_input, _x_decoded_mean)
# observe how changes along the two latent dimensions affect the output
n = 15  # figure with 15x15 digits
digit_size = 28
figure = np.zeros((digit_size * n, digit_size * n))
# use quantiles of the normal distribution to build the latent-variable pairs
grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
grid_y = norm.ppf(np.linspace(0.05, 0.95, n))
for i, yi in enumerate(grid_x):
    for j, xi in enumerate(grid_y):
        z_sample = np.array([[xi, yi]])
        x_decoded = generator.predict(z_sample)
        digit = x_decoded[0].reshape(digit_size, digit_size)
        figure[i * digit_size:(i + 1) * digit_size,
               j * digit_size:(j + 1) * digit_size] = digit
plt.figure(figsize=(10, 10))
plt.imshow(figure, cmap='Greys_r')
# plt.show()
plt.savefig('Greys_r.jpg')
def absPortf_VC_spec_VaR(listofInv=dfTMP,
                         numDays=2,
                         MoneyVol=1000,
                         startDt='2016-11-25',
                         endDt='2016-11-30'):
    """
    Variance-Covariance method of computing the VaR of specific
    stocks (separately, not as a portfolio) over a time frame.
    """
    #print(listofInv.head(9))
    colListFull = list(listofInv.columns.values)
    colListPrime = list(listofInv.columns.values)
    listRisk_95 = []
    listRisk_99 = []

    ##startDate = pd.to_datetime('22/11/2016' )
    startDate = np.datetime64(startDt)
    endDate = np.datetime64(endDt)

    ##remove Timestamp, Trade Open, Trade Close
    listofInv = pd.DataFrame(listofInv.ix[1:, 0:])

    listofInv = pd.DataFrame(listofInv)
    #listofInv['Date'] = [time.date() for time in listofInv['Date']]

    listofInv['Date'] = pd.to_datetime(listofInv['Date'], format='%Y-%m-%d')
    listofInv['Date'] = pd.to_datetime(listofInv['Date'])

    mask = (listofInv['Date'] >= startDate) & (listofInv['Date'] <= endDate)
    #print (mask)
    listofInv = listofInv.loc[mask]

    del colListFull[0]
    del colListPrime[0]
    #drop column from the Data frame
    listofInv = listofInv.drop('Date', 1)
    colListPrime = colListPrime[::2]

    #print(listofInv.head(10))
    #print(len(colListFull))

    for i in range(1, len(colListFull), 2):  ##changed to 1 from 2
        #print("i= " + str(i))
        #print("NOW col" + str(listofInv.ix[1:,(i):(i+1)].head(8)))

        tmp = pd.DataFrame(listofInv.ix[1:, (i):(i + 1)])
        tmp = tmp[pd.notnull(tmp[tmp.columns[(0)]])]
        #print("curr col name is :" + tmp.columns[0])
        ##tmp.columns[0]
        #print(tmp.head(7))

        tmp["rets" + str(tmp.columns[0])] = tmp[tmp.columns[0]].pct_change()

        mu = np.mean(tmp["rets" + str(tmp.columns[0])])
        sigma = np.std(tmp["rets" + str(tmp.columns[0])])
        #print(tmp.head(7))
        #print(mu)
        #print(sigma)
        valueAtRisk_95 = MoneyVol - MoneyVol * (norm.ppf(0.05, mu, sigma) + 1)
        valueAtRisk_99 = MoneyVol - MoneyVol * (norm.ppf(0.01, mu, sigma) + 1)

        listRisk_95.append(valueAtRisk_95)
        listRisk_99.append(valueAtRisk_99)

        #valueAtRisk_95 = MoneyVol - MoneyVol*(norm.ppf(0.05, mu, sigma) + 1)
        #valueAtRisk_99 = MoneyVol - MoneyVol*(norm.ppf(0.01, mu, sigma) + 1)

        #Portf_stddev = (sum1+2*sum2) ** (0.5)
        #print("Portfolio Historical Value at Risk with 95% confidence is: " + str(valueAtRisk_95))
        #print("Portfolio Historical Value at Risk with 99% confidence is: " + str(valueAtRisk_99))

        del tmp
    ##set1.columns = [colListPrime]
    ##print(set1.head())
    #print("VaR for a particular Investments:" + str(listRisk_95))
    setRisk_95 = pd.DataFrame(listRisk_95).T
    setRisk_99 = pd.DataFrame(listRisk_99).T

    setRisk_95.columns = [colListPrime]
    setRisk_99.columns = [colListPrime]

    setRisk_95 = setRisk_95**2
    setRisk_99 = setRisk_99**2
    #print(setRisk_95)

    setRisk_95['FINAL_95'] = setRisk_95.groupby(
        (np.arange(len(setRisk_95.columns)) // len(setRisk_95.columns) * 10) +
        1,
        axis=1).sum().add_prefix('sum')
    setRisk_95['FINAL_95'] = setRisk_95['FINAL_95']**0.5

    setRisk_99['FINAL_99'] = setRisk_99.groupby(
        (np.arange(len(setRisk_99.columns)) // len(setRisk_99.columns) * 10) +
        1,
        axis=1).sum().add_prefix('sum')
    setRisk_99['FINAL_99'] = setRisk_99['FINAL_99']**0.5

    setPortfVaR = pd.DataFrame()

    print(setRisk_95)
    print(setRisk_99)
Example #45
def interval_arcsin(x, alpha):
    u = np.abs(norm.ppf(alpha / 2))
    return u * np.sqrt(variance_of_arcsin(x))
Example #46
    def estimate_ate(self,
                     X,
                     p,
                     treatment,
                     y,
                     bootstrap_ci=False,
                     n_bootstraps=1000,
                     bootstrap_size=10000):
        """Estimate the Average Treatment Effect (ATE).

        Args:
            X (np.matrix): a feature matrix
            p (np.ndarray or dict): an array of propensity scores of float (0,1) in the single-treatment case
                                    or, a dictionary of treatment groups that map to propensity vectors of float (0,1)
            treatment (np.array): a treatment vector
            y (np.array): an outcome vector
            bootstrap_ci (bool): whether run bootstrap for confidence intervals
            n_bootstraps (int): number of bootstrap iterations
            bootstrap_size (int): number of samples per bootstrap
            verbose (str): whether to output progress logs

        Returns:
            The mean and confidence interval (LB, UB) of the ATE estimate.
        """
        te = self.fit_predict(X, p, treatment, y)

        check_p_conditions(p, self.t_groups)
        if isinstance(p, np.ndarray):
            treatment_name = self.t_groups[0]
            p = {treatment_name: p}

        ate = np.zeros(self.t_groups.shape[0])
        ate_lb = np.zeros(self.t_groups.shape[0])
        ate_ub = np.zeros(self.t_groups.shape[0])

        for i, group in enumerate(self.t_groups):
            w = (treatment == group).astype(int)
            prob_treatment = float(sum(w)) / X.shape[0]
            _ate = te[:, i].mean()

            se = (np.sqrt((self.vars_t[group] / prob_treatment) +
                          (self.vars_c[group] /
                           (1 - prob_treatment)) + te[:, i].var()) /
                  X.shape[0])

            _ate_lb = _ate - se * norm.ppf(1 - self.ate_alpha / 2)
            _ate_ub = _ate + se * norm.ppf(1 - self.ate_alpha / 2)

            ate[i] = _ate
            ate_lb[i] = _ate_lb
            ate_ub[i] = _ate_ub

        if not bootstrap_ci:
            return ate, ate_lb, ate_ub
        else:
            t_groups_global = self.t_groups
            _classes_global = self._classes
            model_mu_global = deepcopy(self.model_mu)
            models_tau_global = deepcopy(self.models_tau)

            logger.info('Bootstrap Confidence Intervals for ATE')
            ate_bootstraps = np.zeros(shape=(self.t_groups.shape[0],
                                             n_bootstraps))

            for n in tqdm(range(n_bootstraps)):
                cate_b = self.bootstrap(X,
                                        p,
                                        treatment,
                                        y,
                                        size=bootstrap_size)
                ate_bootstraps[:, n] = cate_b.mean()

            ate_lower = np.percentile(ate_bootstraps,
                                      (self.ate_alpha / 2) * 100,
                                      axis=1)
            ate_upper = np.percentile(ate_bootstraps,
                                      (1 - self.ate_alpha / 2) * 100,
                                      axis=1)

            # restore the member variables that were overwritten by the last bootstrap iteration
            self.t_groups = t_groups_global
            self._classes = _classes_global
            self.model_mu = deepcopy(model_mu_global)
            self.models_tau = deepcopy(models_tau_global)
            return ate, ate_lower, ate_upper
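The closed-form branch above is the standard normal approximation, ATE +/- z_{1-alpha/2} * se. A minimal standalone sketch (the function name and numbers are illustrative, not part of this class):

# Sketch: normal-approximation confidence interval, mirroring the closed-form branch above.
from scipy.stats import norm

def normal_ci(estimate, se, alpha=0.05):
    z = norm.ppf(1 - alpha / 2)          # ~1.96 for a 95% interval
    return estimate - z * se, estimate + z * se

print(normal_ci(0.12, 0.03))             # roughly (0.061, 0.179)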
Example #47
0
def main():

    data = bmnist()[:2]  # ignore test split
    model = VAE(z_dim=ARGS.zdim)
    print('VAE parameter count:', sum(p.numel() for p in model.parameters()))

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    writer = SummaryWriter('logs/log1')

    train_curve, val_curve = [], []
    for epoch in range(ARGS.epochs):
        #"""
        elbos = run_epoch(model, data, optimizer, writer)
        train_elbo, val_elbo = elbos
        writer.add_scalars('data/elbos', {
            'train elbo': train_elbo.item(),
            'val elbo': val_elbo.item()
        }, epoch)
        train_curve.append(train_elbo)
        val_curve.append(val_elbo)
        print(f"[Epoch {epoch}] train elbo: {train_elbo} val_elbo: {val_elbo}")
        #       """

        # --------------------------------------------------------------------
        #  Add functionality to plot samples from model during training.
        #  You can use the make_grid functionality that is already imported.
        # --------------------------------------------------------------------

        if epoch == 36:
            torch.save(model.state_dict(),
                       'manifoldstate' + str(ARGS.zdim) + '.pt')
        #model.load_state_dict(torch.load('modelstate/modelstate30.pt'))
        #model.eval()

        model_im = model.sample(9)[0]
        im_grid = make_grid(model_im, nrow=3)
        writer.add_image('data/DecoIm', im_grid, epoch)

        #plt.imshow(im_grid.permute(1, 2, 0))
        #plt.axis('off')
        #plt.savefig('VAEsample' + str(epoch) + '.png')
        #plt.close()

        # --------------------------------------------------------------------
        #  Add functionality to plot the learned data manifold after
        #  training, if required (i.e., if zdim == 2). You can use the
        #  make_grid functionality that is already imported.
        # --------------------------------------------------------------------
        if ARGS.zdim == 2:

            x = torch.linspace(norm.ppf(0.1), norm.ppf(0.9), 10)
            xx, xy = torch.meshgrid(x, x)
            z_mesh = torch.stack([xx, xy], 0)
            z_mesh = z_mesh.view(2, -1).t()
            model_bern = model.sample(1, z_mesh)[1]
            im_grid = make_grid(model_bern, nrow=10)
            writer.add_image('data/ManifoldIm', im_grid, epoch)

            #plt.imshow(im_grid.permute(1, 2, 0))
            #plt.axis('off')
            #plt.savefig('VAEmanifold.png')

    save_elbo_plot(train_curve, val_curve, 'elbo.pdf')
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
Example #48
0
model.default_var_location = genn_wrapper.VarLocation_DEVICE
model.default_sparse_connectivity_location = genn_wrapper.VarLocation_DEVICE

lif_init = {
    "V": genn_model.init_var("Normal", {
        "mean": -58.0,
        "sd": 5.0
    }),
    "RefracTime": 0.0
}
poisson_init = {"current": 0.0}

exp_curr_params = {"tau": 0.5}

quantile = 0.9999
normal_quantile_cdf = norm.ppf(quantile)
max_delay = {
    pop: MEAN_DELAY[pop] + (DELAY_SD[pop] * normal_quantile_cdf)
    for pop in POPULATION_NAMES
}
print("Max excitatory delay:%fms , max inhibitory delay:%fms" %
      (max_delay["E"], max_delay["I"]))

# Calculate maximum dendritic delay slots
# **NOTE** it seems inefficient using maximum for all but this allows more aggressive merging of postsynaptic models
max_dendritic_delay_slots = int(round(max(itervalues(max_delay)) / DT_MS))
print("Max dendritic delay slots:%d" % max_dendritic_delay_slots)

print("Creating neuron populations:")
total_neurons = 0
neuron_populations = {}
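max_delay above caps a normally distributed synaptic delay at its 99.99th percentile, mean + sd * ppf(0.9999). A quick hedged check with placeholder numbers (not the model's MEAN_DELAY/DELAY_SD values):

# Sketch: bound a Normal(mean, sd) delay by its 99.99th percentile.
from scipy.stats import norm

mean_delay, delay_sd = 1.5, 0.75                         # placeholder values, in ms
q = norm.ppf(0.9999)                                     # ~3.719
print(mean_delay + delay_sd * q)                         # ~4.29 ms
print(norm.ppf(0.9999, loc=mean_delay, scale=delay_sd))  # same number via loc/scale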
Example #49
0
def compute_association(target,
                        features,
                        function=information_coefficient,
                        dropna='all',
                        target_ascending=False,
                        features_ascending=False,
                        n_jobs=1,
                        min_n_per_job=100,
                        n_features=0.95,
                        n_samplings=30,
                        confidence=0.95,
                        n_permutations=30,
                        random_seed=RANDOM_SEED,
                        filepath=None):
    """
    Compute: score_i = function(target, feature_i) for all features.
    Compute confidence interval (CI) for n_features features.
    Compute P-value and FDR (BH) for all features.
    :param target: Series; (n_samples); must have name and indices, matching features' column index
    :param features: DataFrame; (n_features, n_samples); must have row and column indices
    :param function: function; scoring function
    :param dropna: str; 'any' or 'all'
    :param target_ascending: bool; target is ascending or not
    :param n_jobs: int; number of jobs to parallelize
    :param min_n_per_job: int; minimum number of n per job
    :param features_ascending: bool; True if features scores increase from top to bottom, and False otherwise
    :param n_features: int or float; number of features to compute confidence interval and plot;
                        number threshold if >= 1, percentile threshold if < 1, and don't compute if None
    :param n_samplings: int; number of bootstrap samplings to build distribution to get CI; must be > 2 to compute CI
    :param confidence: float; confidence level for the confidence interval (e.g. 0.95)
    :param n_permutations: int; number of permutations for permutation test to compute P-val and FDR
    :param random_seed: int;
    :param filepath: str;
    :return: Series, DataFrame, DataFrame; (n_features, 8 ('score', '<confidence> moe',
                                            'p-value (forward)', 'p-value (reverse)', 'p-value',
                                            'fdr (forward)', 'fdr (reverse)', 'fdr'))
    """

    # TODO: make empty DataFrame to absorb the results instead of concatenation

    # Make sure target is a Series and features a DataFrame
    # Keep samples found in both target and features
    # Drop features with less than 2 unique values
    target, features = _preprocess_target_and_features(
        target, features, dropna=dropna, target_ascending=target_ascending)

    results = DataFrame(index=features.index,
                        columns=[
                            'score', '{} moe'.format(confidence),
                            'p-value (forward)', 'p-value (reverse)',
                            'p-value', 'fdr (forward)', 'fdr (reverse)', 'fdr'
                        ])

    #
    # Compute: score_i = function(target, feature_i)
    #
    print_log('Scoring (n_jobs={}) ...'.format(n_jobs))

    # Split features for parallel computing
    if features.shape[0] < n_jobs * min_n_per_job:
        n_jobs = 1
    split_features = split_dataframe(features, n_jobs)

    # Score
    # scores = _score((target,features,function))
    #
    scores = concat(parallelize(_score, [(target, f, function)
                                         for f in split_features], n_jobs),
                    verify_integrity=True)

    # Load scores and sort results by scores
    results.ix[scores.index, 'score'] = scores
    results.sort_values('score', ascending=features_ascending, inplace=True)

    #
    #  Compute CI using bootstrapped distribution
    #
    if n_samplings < 2:
        print_log('Not computing CI because n_samplings < 2.')

    elif ceil(0.632 * features.shape[1]) < 3:
        print_log('Not computing CI because 0.632 * n_samples < 3.')

    else:
        print_log(
            'Computing {} CI using distributions built by {} bootstraps ...'
            .format(confidence, n_samplings))
        indices_to_bootstrap = get_top_and_bottom_indices(
            results, 'score', n_features)

        # Bootstrap: for n_sampling times, randomly choose 63.2% of the samples, score, and build score distribution
        sampled_scores = DataFrame(index=indices_to_bootstrap,
                                   columns=range(n_samplings))
        seed(random_seed)
        for c_i in sampled_scores:
            # Random sample
            random_samples = choice(features.columns.tolist(),
                                    int(ceil(0.632 *
                                             features.shape[1]))).tolist()
            sampled_target = target.ix[random_samples]
            sampled_features = features.ix[indices_to_bootstrap,
                                           random_samples]
            rs = get_state()

            # Score
            sampled_scores.ix[:, c_i] = sampled_features.apply(
                lambda f: function(sampled_target, f), axis=1)

            set_state(rs)

        # Compute scores' confidence intervals using bootstrapped score distributions
        # TODO: improve confidence interval calculation
        z_critical = norm.ppf(q=confidence)

        # Load confidence interval
        results.ix[sampled_scores.index,
                   '{} moe'.format(confidence)] = sampled_scores.apply(
                       lambda f: z_critical * (f.std() / sqrt(n_samplings)),
                       axis=1)

    #
    # Compute P-values and FDRs by scoring against permuted targets
    #
    if n_permutations < 1:
        print_log('Not computing P-value and FDR because n_perm < 1.')
    else:
        print_log(
            'Computing P-value & FDR by scoring against {} permuted targets (n_jobs={}) ...'
            .format(n_permutations, n_jobs))

        # Permute and score
        permutation_scores = concat(
            parallelize(_permute_and_score,
                        [(target, f, function, n_permutations, random_seed)
                         for f in split_features], n_jobs),
            verify_integrity=True)

        print_log('\tComputing P-value and FDR ...')
        # All scores
        all_permutation_scores = permutation_scores.values.flatten()
        for i, (r_i, r) in enumerate(results.iterrows()):
            # This feature's score
            s = r.ix['score']

            # Compute forward P-value
            p_value_forward = (all_permutation_scores >=
                               s).sum() / len(all_permutation_scores)
            if not p_value_forward:
                p_value_forward = float(1 / len(all_permutation_scores))
            results.ix[r_i, 'p-value (forward)'] = p_value_forward

            # Compute reverse P-value
            p_value_reverse = (all_permutation_scores <=
                               s).sum() / len(all_permutation_scores)
            if not p_value_reverse:
                p_value_reverse = float(1 / len(all_permutation_scores))
            results.ix[r_i, 'p-value (reverse)'] = p_value_reverse

        # Compute forward FDR
        results.ix[:, 'fdr (forward)'] = multipletests(
            results.ix[:, 'p-value (forward)'], method='fdr_bh')[1]

        # Compute reverse FDR
        results.ix[:, 'fdr (reverse)'] = multipletests(
            results.ix[:, 'p-value (reverse)'], method='fdr_bh')[1]

        # Creating the summary P-value and FDR
        forward = results.ix[:, 'score'] >= 0
        results.ix[:, 'p-value'] = concat([
            results.ix[forward, 'p-value (forward)'],
            results.ix[~forward, 'p-value (reverse)']
        ])
        results.ix[:, 'fdr'] = concat([
            results.ix[forward, 'fdr (forward)'], results.ix[~forward,
                                                             'fdr (reverse)']
        ])

    # Save
    if filepath:
        establish_filepath(filepath)
        results.to_csv(filepath, sep='\t')

    return target, features, results
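Note that z_critical above comes from norm.ppf(q=confidence), the one-sided quantile (about 1.645 for 0.95); the TODO flags the CI calculation for improvement. A small standalone sketch of the margin-of-error formula the code applies, with the two-sided quantile shown for comparison (the helper name and data are illustrative):

# Sketch: margin of error z * std / sqrt(B) over B bootstrap scores, as applied above.
import numpy as np
from scipy.stats import norm

def bootstrap_moe(boot_scores, confidence=0.95, two_sided=False):
    z = norm.ppf(0.5 + confidence / 2) if two_sided else norm.ppf(confidence)
    return z * np.std(boot_scores, ddof=1) / np.sqrt(len(boot_scores))

scores = np.random.normal(0.3, 0.1, size=30)   # stand-in for one row of sampled_scores
print(bootstrap_moe(scores), bootstrap_moe(scores, two_sided=True))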
Example #50
0
def nn_trainer(train_mark, model, train_data, test_conv_X, test_data_X,
               test_data_Y, trainer_params_list, ctx):
    """Parsing the params list"""
    ### The data
    batch_size = trainer_params_list['batch_size']
    epochs = trainer_params_list['epoch_num']

    loss_func = Gaussian_loss
    initializer = trainer_params_list['initializer']
    optimizer = trainer_params_list['optimizer']
    optimizer_params = trainer_params_list['optimizer_params']

    #train_iter = gluon.data.DataLoader(train_data, batch_size, shuffle=True)
    ### The model
    mx.random.seed(123456)
    model.collect_params().initialize(initializer, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(),
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)
    n_train = len(train_data)
    n_test = len(test_data_Y)
    ### The training process
    for e in range(epochs):
        start = time.time()
        train_loss = 0
        k = 0
        train_iter = gluon.data.DataLoader(train_data,
                                           batch_size,
                                           shuffle=True)
        for conv_data, data, label in train_iter:
            label = label.as_in_context(ctx)
            with autograd.record():
                output_mu, output_sigma = model(data, conv_data)
                loss = loss_func(output_mu, output_sigma, label)
            loss.backward()
            trainer.step(1, ignore_stale_grad=True)
            train_loss += nd.sum(loss).asscalar()
            k += 1
            if k * batch_size > n_train * 0.3:
                print('training_data_nb:', k * batch_size)
                break
        ### The test loss
        valid_mu, valid_sigma = DLPred2(model, test_data_X, test_conv_X)

        valid_loss = loss_func(valid_mu, (valid_sigma),
                               (test_data_Y)).asscalar()

        #rho50 = rho_risk(0,7, valid_mu.asnumpy(), test_data_Y.asnumpy(), 0.5)
        avg_rho50 = avg_rho_risk(valid_mu.asnumpy(), test_data_Y.asnumpy(),
                                 0.5, 7)

        valid_pred90 = norm.ppf(0.9, valid_mu.asnumpy(), valid_sigma.asnumpy())
        #  print(valid_mu[0:5,:])
        #  print( valid_sigma[0:5,:])
        #  print(valid_pred90[0:5,:])

        # rho90 = rho_risk(0,7, valid_pred90, test_data_Y.asnumpy(), 0.9)
        avg_rho90 = avg_rho_risk(valid_pred90, test_data_Y.asnumpy(), 0.9, 7)

        #print("Epoch %d, valid loss: %f rho50: %f, rho90 %f" % (e, valid_loss, rho50,rho90))
        print("Epoch %d, valid loss: %f avg_rho50: %f, avg_rho90 %f" %
              (e, valid_loss, avg_rho50, avg_rho90))
        end = time.time()
        print('total_time:', end - start)
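rho_risk/avg_rho_risk are defined elsewhere; a common definition is the normalized quantile (pinball) loss, evaluated on the q-quantile forecast obtained above with norm.ppf(q, mu, sigma). A hedged sketch under that assumption:

# Sketch (assumption): normalized quantile / pinball loss for a q-quantile forecast.
import numpy as np

def rho_risk_sketch(pred_q, y_true, q):
    diff = y_true - pred_q
    loss = np.where(diff >= 0, q * diff, (q - 1.0) * diff)   # pinball loss per point
    return 2.0 * loss.sum() / np.abs(y_true).sum()           # normalized, DeepAR-style rho-risk

y = np.array([10.0, 12.0, 9.0])
p90 = np.array([12.0, 13.5, 10.5])                           # e.g. norm.ppf(0.9, mu, sigma) per point
print(rho_risk_sketch(p90, y, 0.9))                          # ~0.032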
Example #51
0
File: test.py Project: vyraun/ais
from scipy.stats import norm


class Generator(object):
    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim

    def __call__(self, z):
        return z * 2 + 3


generator = Generator(1, 1)
prior = NormalPrior()
kernel = ParsenDensityEstimator()
model = ais.Model(generator, prior, kernel, 0.25, 10000)

p = norm()
x = np.linspace(norm.ppf(0.01, loc=3, scale=2), norm.ppf(0.99, loc=3, scale=2),
                100)
p1 = norm.pdf(x, loc=3, scale=2)
xx = np.reshape(x, [100, 1])

schedule = ais.get_schedule(100, rad=4)
print(schedule)
p2 = np.exp(model.ais(xx, schedule))

plt.plot(x, p1)
plt.plot(x, p2)
plt.show()
Example #52
0
"""
  Name     : c12_16_VaR_baesd_on_normality.py
  Book     : Python for Finance (2nd ed.)
  Publisher: Packt Publishing Ltd. 
  Author   : Yuxing Yan
  Date     : 6/6/2017
  email    : [email protected]
             [email protected]
"""

import numpy as np
import pandas as pd
from scipy.stats import norm
from matplotlib.finance import quotes_historical_yahoo_ochl as getData
#
ticker = 'WMT'  # input 1
n_shares = 500  # input 2
confidence_level = 0.99  # input 3
begdate = (2012, 1, 1)  # input 4
enddate = (2016, 12, 31)  # input 5
#
z = norm.ppf(1 - confidence_level)
x = getData(ticker, begdate, enddate, asobject=True, adjusted=True)
ret = x.aclose[1:] / x.aclose[:-1] - 1
#
position = n_shares * x.close[0]
mean = np.mean(ret)
std = np.std(ret)
#
VaR = position * (mean + z * std)
print("Holding=", position, "VaR=", round(VaR, 4), "tomorrow")
Example #53
0
from scipy.stats import norm
import numpy as np
import matplotlib.pyplot as plt


mu = 10
sigma = 2

x = np.arange(norm.ppf(0.01,loc=mu,scale=sigma), norm.ppf(0.99,loc=mu, scale=sigma), 0.1)
print(x)
fig, [axpdf, axcdf, axhisto] = plt.subplots(1,3)


randVect = norm.rvs(loc=mu, scale=sigma, size=1000)

axpdf.plot(x, norm.pdf(x, mu, sigma), 'r-', label='PDF')
axpdf.legend(loc='best')
axcdf.plot(x, norm.cdf(x, mu, sigma), 'r-', label='CDF')
axcdf.legend(loc='best')
axhisto.hist(randVect, color='0.75', label='Normally distributed values')
axhisto.legend(loc='best')
fig.tight_layout()
plt.show()


Example #54
0
    def __call__(self, sampled_parameters, loglikelihood, start_param_vec, ns_boundary, **kwargs):
        """Run the sampler.

        Args:
            sampled_parameters (:obj:`list` of
                :obj:`gleipnir.sampled_parameter.SampledParameter`): The
                parameters that are being sampled.
            loglikelihood (function): The log likelihood function.
            start_param_vec (obj:`numpy.ndarray`): The starting position of
                parameter vector for the parameters being sampled.
            ns_boundary (float): The current lower likelihood bound from the
                Nested Sampling routine.
            kwargs (dict): Pass in any other method specific keyword arguments.
        """
        if self._first:
            self._ndim = len(sampled_parameters)
            for sampled_parameter in sampled_parameters:
                rs = sampled_parameter.rvs(100)
                mirs = np.min(rs)
                mars = np.max(rs)
                width = mars - mirs
                #print(width)
                self._widths.append(0.5*width)
            #steps.append(0.5*width)
            self._widths = np.array(self._widths)
            self._first = False

        start_likelihood = loglikelihood(start_param_vec)

        # Tuning cycles
        steps = self._widths.copy()
        acceptance = np.zeros(self._ndim)
        cur_point = start_param_vec.copy()
        cur_likelihood = start_likelihood
        for i in range(self.tuning_cycles):
            for k in range(20):
                rsteps = np.random.random(self._ndim)
                u = np.random.random(self._ndim)
                for j in range(self._ndim):
                    new_point = cur_point.copy()
                    cur_pointj = cur_point[j]
                    widthj = self._widths[j]
                    # Generate the appropriate proposal distribution
                    if self.proposal == 'normal':
                        new_pointj = norm.ppf(rsteps[j],loc=cur_pointj, scale=widthj)
                    else:
                        new_pointj = uniform.ppf(rsteps[j],loc=cur_pointj-(widthj/2.0), scale=widthj)

                    new_point[j] = new_pointj
                    cur_priorj = sampled_parameters[j].prior(cur_pointj)
                    new_priorj = sampled_parameters[j].prior(new_point[j])
                    ratio = new_priorj/cur_priorj
                    new_likelihood = loglikelihood(new_point)
                    # Metropolis criterion with NS boundary
                    if (u[j] < ratio) and (new_likelihood > ns_boundary):
                        # accept the new point and update
                        cur_point[j] = new_pointj
                        cur_likelihood = new_likelihood
                        acceptance[j] += 1.0
                # Adjust the step sizes
                acceptance_ratio = acceptance/20.0
                less_than_mask = acceptance_ratio < 0.2
                gt_mask = acceptance_ratio > 0.6
                steps[less_than_mask] *= 0.66
                steps[gt_mask] *= 1.33
                acceptance[:] = 0.0

        # Start the sampling chain
        self._widths = steps.copy()
        cur_point = start_param_vec.copy()
        # curr_likelihood = start_likelihood
        for i in range(self.iterations+self.burn_in):
                rsteps = np.random.random(self._ndim)
                u = np.random.random(self._ndim)
                for j in range(self._ndim):
                    new_point = cur_point.copy()
                    cur_pointj = cur_point[j]
                    widthj = self._widths[j]
                    # Generate the appropriate proposal distribution
                    if self.proposal == 'normal':
                        new_pointj = norm.ppf(rsteps[j],loc=cur_pointj, scale=widthj)
                    else:
                        new_pointj = uniform.ppf(rsteps[j],loc=cur_pointj-(widthj/2.0), scale=widthj)

                    new_point[j] = new_pointj
                    cur_priorj = sampled_parameters[j].prior(cur_pointj)
                    new_priorj = sampled_parameters[j].prior(new_point[j])
                    ratio = new_priorj/cur_priorj
                    #print("ratio",ratio, "cur_priorj", cur_priorj, "new_priorj", new_priorj, "cur_pointj", cur_pointj, "new_pointj", new_pointj, "rstepj", rsteps[j])
                    new_likelihood = loglikelihood(new_point)
                    # Metropolis criterion with NS boundary
                    if (u[j] < ratio) and (new_likelihood > ns_boundary):
                        # accept the new point and update
                        cur_point[j] = new_pointj
                        cur_likelihood = new_likelihood


        return cur_point, cur_likelihood
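The proposal draw above is inverse-transform sampling: a uniform variate pushed through the proposal's ppf. A tiny check that norm.ppf(u, loc, scale) equals loc + scale * norm.ppf(u):

# Sketch: inverse-transform sampling for the normal proposal used above.
import numpy as np
from scipy.stats import norm

u = np.random.random(5)
loc, scale = 2.0, 0.5
print(np.allclose(norm.ppf(u, loc=loc, scale=scale),
                  loc + scale * norm.ppf(u)))   # True: both are Normal(loc, scale) draws given u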
Example #55
0
                       np.arange(0.2, 2.0, 0.01), np.arange(2.0, 10.1, 0.1)))
#list of viewing angles
thetas = np.linspace(0, 180, 100)
#SNR of 1, 2, 3, 4, 5
confs = [norm.cdf(sn) for sn in limSNs]
print "Confidence levels:", confs
print "Sigma levels:", limSNs
print "Trial a13s:", a13s
print "Trial thetas:", thetas

#for each confidence interval
for n, conf in enumerate(confs):
    #array to hold percent of viewing angles ruled out for each a13 at this conf
    outangles = np.zeros(len(a13s))
    #sigma needed to establish confidence below LC
    sig = norm.ppf(conf)
    #for each a13 model
    for j, a13 in enumerate(a13s):
        print "Testing model at a13:", a13
        #boolean mask for whether angle is ruled out to given confidence
        mask = np.array([True] * len(thetas))
        #for each band
        for i in range(len(t)):
            #generate theoretical light curve
            Lk = np.array([
                KasenFit(ti, a13, 1.0, wave_0[bands[band[i]]], m_c, e_51, z, 0)
                for ti in t[i]
            ])

            #compare to observed for each viewing angle
            for k, theta in enumerate(thetas):
Example #56
0
encoder = Model(x, z_mu)

# display a 2D plot of the digit classes in the latent space
z_test = encoder.predict(x_test, batch_size=batch_size)
plt.figure(figsize=(6, 6))
plt.scatter(z_test[:, 0],
            z_test[:, 1],
            c=y_test,
            alpha=.4,
            s=3**2,
            cmap='viridis')
plt.colorbar()
plt.show()

# display a 2D manifold of the digits
n = 15  # figure with 15x15 digits
digit_size = 28

# linearly spaced coordinates on the unit square were transformed
# through the inverse CDF (ppf) of the Gaussian to produce values
# of the latent variables z, since the prior of the latent space
# is Gaussian
u_grid = np.dstack(
    np.meshgrid(np.linspace(0.05, 0.95, n), np.linspace(0.05, 0.95, n)))
z_grid = norm.ppf(u_grid)
x_decoded = decoder.predict(z_grid.reshape(n * n, 2))
x_decoded = x_decoded.reshape(n, n, digit_size, digit_size)

plt.figure(figsize=(10, 10))
plt.imshow(np.block(list(map(list, x_decoded))), cmap='gray')
plt.show()
Example #57
0
def chamberlain(n, q, alpha=.05):
    return norm.ppf(1 - alpha / 2) * np.sqrt(q*(1 - q) / n)
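This is the Chamberlain (1994) bandwidth, h = ppf(1 - alpha/2) * sqrt(q*(1-q)/n); in the quantile-regression fit method shown below it widens the quantile to [q - h, q + h] when estimating the residual density. A quick numeric check:

# Sketch: Chamberlain bandwidth for the median (q=0.5) with n=1000 observations.
import numpy as np
from scipy.stats import norm

n, q, alpha = 1000, 0.5, 0.05
h = norm.ppf(1 - alpha / 2) * np.sqrt(q * (1 - q) / n)
print(h)                                  # ~0.031
print(norm.ppf(q + h) - norm.ppf(q - h))  # ~0.155, the quantile window used for the density estimate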
Example #58
0
sigma220 = (220**0.5) * sigma
print('The probability of dropping over 40% in 220 days is ',
      norm.cdf(-0.4, mu220, sigma220))

# In[7]:

#Probability
mu220 = 220 * mu
sigma220 = (220**0.5) * sigma
drop20 = norm.cdf(-0.2, mu220, sigma220)
print('The probability of dropping over 20% in 220 days is ', drop20)

# In[8]:

#Value at Risk
VaR = norm.ppf(0.05, mu, sigma)
print('Single day Value at Risk ', VaR)

# In[9]:

#Value at Risk
print('5% quantile ', norm.ppf(0.05, mu, sigma))
print('95% quantile', norm.ppf(0.95, mu, sigma))

# In[10]:

#Value at Risk
q25 = norm.ppf(0.25, mu, sigma)
print('25% quantile', q25)
q75 = norm.ppf(0.75, mu, sigma)
print('75% quantile', q75)
Example #59
0
    def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
            max_iter=1000, p_tol=1e-6, **kwargs):
        '''Solve by Iterative Weighted Least Squares

        Parameters
        ----------
        q : float
            Quantile must be between 0 and 1
        vcov : string, method used to calculate the variance-covariance matrix
            of the parameters. Default is ``robust``:

            - robust : heteroskedasticity robust standard errors (as suggested
              in Greene 6th edition)
            - iid : iid errors (as in Stata 12)

        kernel : string, kernel to use in the kernel density estimation for the
            asymptotic covariance matrix:

            - biw: Biweight
            - epa: Epanechnikov
            - cos: Cosine
            - gau: Gaussian
            - par: Parzen

        bandwidth: string, Bandwidth selection method in kernel density
            estimation for asymptotic covariance estimate (full
            references in QuantReg docstring):

            - hsheather: Hall-Sheather (1988)
            - bofinger: Bofinger (1975)
            - chamberlain: Chamberlain (1994)
        '''

        if q < 0 or q > 1:
            raise Exception('q must be between 0 and 1')

        kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
        if kernel not in kern_names:
            raise Exception("kernel must be one of " + ', '.join(kern_names))
        else:
            kernel = kernels[kernel]

        if bandwidth == 'hsheather':
            bandwidth = hall_sheather
        elif bandwidth == 'bofinger':
            bandwidth = bofinger
        elif bandwidth == 'chamberlain':
            bandwidth = chamberlain
        else:
            raise Exception("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")

        endog = self.endog
        exog = self.exog
        nobs = self.nobs
        exog_rank = np_matrix_rank(self.exog)
        self.rank = exog_rank
        self.df_model = float(self.rank - self.k_constant)
        self.df_resid = self.nobs - self.rank
        n_iter = 0
        xstar = exog

        beta = np.ones(exog_rank)
        # TODO: better start, initial beta is used only for convergence check

        # Note the following doesn't work yet,
        # the iteration loop always starts with OLS as initial beta
#        if start_params is not None:
#            if len(start_params) != rank:
#                raise ValueError('start_params has wrong length')
#            beta = start_params
#        else:
#            # start with OLS
#            beta = np.dot(np.linalg.pinv(exog), endog)

        diff = 10
        cycle = False

        history = dict(params = [], mse=[])
        while n_iter < max_iter and diff > p_tol and not cycle:
            n_iter += 1
            beta0 = beta
            xtx = np.dot(xstar.T, exog)
            xty = np.dot(xstar.T, endog)
            beta = np.dot(pinv(xtx), xty)
            resid = endog - np.dot(exog, beta)

            mask = np.abs(resid) < .000001
            resid[mask] = np.sign(resid[mask]) * .000001
            resid = np.where(resid < 0, q * resid, (1-q) * resid)
            resid = np.abs(resid)
            xstar = exog / resid[:, np.newaxis]
            diff = np.max(np.abs(beta - beta0))
            history['params'].append(beta)
            history['mse'].append(np.mean(resid*resid))

            if (n_iter >= 300) and (n_iter % 100 == 0):
                # check for convergence circle, shouldn't happen
                for ii in range(2, 10):
                    if np.all(beta == history['params'][-ii]):
                        cycle = True
                        warnings.warn("Convergence cycle detected",
                                      ConvergenceWarning)
                        break

        if n_iter == max_iter:
            warnings.warn("Maximum number of iterations (1000) reached.",
                          IterationLimitWarning)

        e = endog - np.dot(exog, beta)
        # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
        # h = 0.9 * np.std(e) / (nobs**0.2)
        # Instead, we calculate bandwidth as in Stata 12
        iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
        h = bandwidth(nobs, q)
        h = min(np.std(endog),
                iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

        fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))

        if vcov == 'robust':
            d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
            xtxi = pinv(np.dot(exog.T, exog))
            xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
            vcov = chain_dot(xtxi, xtdx, xtxi)
        elif vcov == 'iid':
            vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
        else:
            raise Exception("vcov must be 'robust' or 'iid'")

        lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

        lfit.q = q
        lfit.iterations = n_iter
        lfit.sparsity = 1. / fhat0
        lfit.bandwidth = h
        lfit.history = history

        return RegressionResultsWrapper(lfit)
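This fit method matches statsmodels' QuantReg; under that assumption, a median regression on toy data would look like the sketch below (data and seed are illustrative):

# Sketch (assumption: statsmodels' QuantReg): median regression on toy data.
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
x = rng.uniform(0, 10, 200)
y = 1.0 + 2.0 * x + rng.standard_normal(200)
X = sm.add_constant(x)

res = sm.QuantReg(y, X).fit(q=0.5, kernel='epa', bandwidth='hsheather')
print(res.params)       # roughly [1, 2]
print(res.bandwidth)    # the h built from norm.ppf(q + h) - norm.ppf(q - h) above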
Example #60
0
def analyze(problem,
            Y,
            calc_second_order=True,
            num_resamples=100,
            conf_level=0.95,
            print_to_console=False,
            parallel=False,
            n_processors=None,
            seed=None):
    """Perform Sobol Analysis on model outputs.

    Returns a dictionary with keys 'S1', 'S1_conf', 'ST', and 'ST_conf', where
    each entry is a list of size D (the number of parameters) containing the
    indices in the same order as the parameter file.  If calc_second_order is
    True, the dictionary also contains keys 'S2' and 'S2_conf'.

    Parameters
    ----------
    problem : dict
        The problem definition
    Y : numpy.array
        A NumPy array containing the model outputs
    calc_second_order : bool
        Calculate second-order sensitivities (default True)
    num_resamples : int
        The number of resamples (default 100)
    conf_level : float
        The confidence interval level (default 0.95)
    print_to_console : bool
        Print results directly to console (default False)

    References
    ----------
    .. [1] Sobol, I. M. (2001).  "Global sensitivity indices for nonlinear
           mathematical models and their Monte Carlo estimates."  Mathematics
           and Computers in Simulation, 55(1-3):271-280,
           doi:10.1016/S0378-4754(00)00270-6.
    .. [2] Saltelli, A. (2002).  "Making best use of model evaluations to
           compute sensitivity indices."  Computer Physics Communications,
           145(2):280-297, doi:10.1016/S0010-4655(02)00280-1.
    .. [3] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
           S. Tarantola (2010).  "Variance based sensitivity analysis of model
           output.  Design and estimator for the total sensitivity index."
           Computer Physics Communications, 181(2):259-270,
           doi:10.1016/j.cpc.2009.09.018.

    Examples
    --------
    >>> X = saltelli.sample(problem, 1000)
    >>> Y = Ishigami.evaluate(X)
    >>> Si = sobol.analyze(problem, Y, print_to_console=True)

    """
    if seed:
        np.random.seed(seed)
    # determining if groups are defined and adjusting the number
    # of rows in the cross-sampled matrix accordingly
    if not problem.get('groups'):
        D = problem['num_vars']
    else:
        D = len(set(problem['groups']))

    if calc_second_order and Y.size % (2 * D + 2) == 0:
        N = int(Y.size / (2 * D + 2))
    elif not calc_second_order and Y.size % (D + 2) == 0:
        N = int(Y.size / (D + 2))
    else:
        raise RuntimeError("""
        Incorrect number of samples in model output file.
        Confirm that calc_second_order matches option used during sampling.""")

    if conf_level < 0 or conf_level > 1:
        raise RuntimeError("Confidence level must be between 0-1.")

    # normalize the model output
    Y = (Y - Y.mean()) / Y.std()

    A, B, AB, BA = separate_output_values(Y, D, N, calc_second_order)
    r = np.random.randint(N, size=(N, num_resamples))
    Z = norm.ppf(0.5 + conf_level / 2)

    if not parallel:
        S = create_Si_dict(D, calc_second_order)

        for j in range(D):
            S['S1'][j] = first_order(A, AB[:, j], B)
            S['S1_conf'][j] = Z * first_order(A[r], AB[r, j], B[r]).std(ddof=1)
            S['ST'][j] = total_order(A, AB[:, j], B)
            S['ST_conf'][j] = Z * total_order(A[r], AB[r, j], B[r]).std(ddof=1)

        # Second order (+conf.)
        if calc_second_order:
            for j in range(D):
                for k in range(j + 1, D):
                    S['S2'][j, k] = second_order(A, AB[:, j], AB[:, k],
                                                 BA[:, j], B)
                    S['S2_conf'][j, k] = Z * second_order(
                        A[r], AB[r, j], AB[r, k], BA[r, j], B[r]).std(ddof=1)

    else:
        tasks, n_processors = create_task_list(D, calc_second_order,
                                               n_processors)

        func = partial(sobol_parallel, Z, A, AB, BA, B, r)
        pool = Pool(n_processors)
        S_list = pool.map_async(func, tasks)
        pool.close()
        pool.join()

        S = Si_list_to_dict(S_list.get(), D, calc_second_order)

    # Print results to console
    if print_to_console:
        print_indices(S, problem, calc_second_order)

    # Add problem context and override conversion method for special case
    S.problem = problem
    S.to_df = MethodType(to_df, S)
    return S
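The *_conf entries are normal-approximation half-widths: Z = norm.ppf(0.5 + conf_level/2) times the standard deviation of the resampled index. A small standalone check (boot_S1 is a stand-in for the resampled first-order estimates):

# Sketch: the Z factor and a bootstrap-style half-width, mirroring S1_conf above.
import numpy as np
from scipy.stats import norm

conf_level = 0.95
Z = norm.ppf(0.5 + conf_level / 2)                  # ~1.96
boot_S1 = np.random.normal(0.31, 0.04, size=100)    # stand-in for first_order(A[r], AB[r, j], B[r])
print(Z * boot_S1.std(ddof=1))                      # half-width reported as S1_conf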