def bootstrap(data, n_bootstraps, user_statistic, kwargs=None,
              pass_indices=False, random_state=None):
    """Compute bootstrapped statistics of a dataset.

    Parameters
    ----------
    data : array_like
        Either a 1-dimensional array of n_samples values, or an
        n-dimensional array ordered as n_samples by n_attributes (a
        warning is issued in that case and the first axis is resampled).
    n_bootstraps : integer
        the number of bootstrap samples to compute.  Note that internally,
        an index array of size (n_bootstraps, n_samples) and, unless
        ``pass_indices`` is True, a resampled copy of the data will be
        allocated.  For very large numbers of bootstraps, this can cause
        memory issues.
    user_statistic : function
        The statistic to be computed.  It is called exactly once.  For
        1-dimensional data it receives an array of shape
        (n_bootstraps, n_samples); for n-dimensional data the resampled
        array is flattened to two dimensions, keeping the last axis.
    kwargs : dictionary (optional)
        A dictionary of keyword arguments to be passed to the
        user_statistic function.
    pass_indices : boolean (optional)
        if True, then the indices of the points rather than the points
        themselves are passed to `user_statistic`
    random_state : RandomState or an int seed (None by default)
        A random number generator instance, or anything accepted by
        ``check_random_state``

    Returns
    -------
    distribution : ndarray
        whatever `user_statistic` returns for the resampled input
    """
    # we don't set kwargs={} by default in the argument list, because using
    # a mutable type as a default argument can lead to strange results
    if kwargs is None:
        kwargs = {}

    rng = check_random_state(random_state)

    data = np.asarray(data)
    if data.ndim != 1:
        n_samples = data.shape[0]
        warnings.warn(
            "bootstrap data are n-dimensional: assuming ordered n_samples by n_attributes"
        )
    else:
        n_samples = data.size

    # Generate random indices with repetition
    ind = rng.randint(n_samples, size=(n_bootstraps, n_samples))

    if pass_indices:
        # the statistic only needs the indices, so the (potentially large)
        # resampled copy of the data is never built
        return user_statistic(ind, **kwargs)

    # Fancy-index once and reuse the result: the resampled array is the
    # largest allocation in this function
    resampled = data[ind]
    resampled = resampled.reshape(-1, resampled.shape[-1])
    return user_statistic(resampled, **kwargs)
def smoothedbootstrap(data, n_bootstraps, user_statistic, kwargs=None,
                      random_state=None):
    """Compute smoothed bootstrapped statistics of a data set.

    Each bootstrap sample is drawn with replacement from `data` and then
    perturbed by zero-mean Gaussian noise whose standard deviation is
    ``np.std(data, ddof=1) / sqrt(n_samples)``.

    Parameters
    ----------
    data : array_like
        A 1-dimensional data array of size n_samples
    n_bootstraps : integer
        the number of bootstrap samples to compute.  Note that internally,
        arrays of size (n_bootstraps, n_samples) will be allocated.
        For very large numbers of bootstraps, this can cause memory issues.
    user_statistic : function
        The statistic to be computed.  This should take an array of data
        of size (n_bootstraps, n_samples) and return the row-wise
        statistics of the data.
    kwargs : dictionary (optional)
        A dictionary of keyword arguments to be passed to the
        user_statistic function.
    random_state : RandomState or an int seed (None by default)
        A random number generator instance.  Both the resampling indices
        and the smoothing noise are drawn from it, so a fixed seed gives
        reproducible results.

    Returns
    -------
    distribution : ndarray
        the bootstrapped distribution of statistics, as returned by
        `user_statistic`

    Raises
    ------
    ValueError
        if `data` is not 1-dimensional
    """
    # we don't set kwargs={} by default in the argument list, because using
    # a mutable type as a default argument can lead to strange results
    if kwargs is None:
        kwargs = {}

    # validate the input before touching the random number generator
    data = np.asarray(data)
    if data.ndim != 1:
        raise ValueError("bootstrap expects 1-dimensional data")
    n_datapts = data.size

    rng = check_random_state(random_state)

    # Generate random indices with repetition
    ind = rng.randint(n_datapts, size=(n_bootstraps, n_datapts))

    # smoothing noise, drawn from the same generator as the indices so that
    # `random_state` controls the whole computation
    noisemean = 0.
    noisesigma = np.std(data, ddof=1) / np.sqrt(n_datapts)
    noise = rng.normal(noisemean, noisesigma, (n_bootstraps, n_datapts))

    # compute the statistic on the perturbed resamples
    return user_statistic(data[ind] + noise, **kwargs)
def smoothedbootstrap(data, n_bootstraps, user_statistic, kwargs=None,
                      random_state=None):
    """Compute smoothed bootstrapped statistics of a data set.

    Bootstrap samples are drawn with replacement from `data`; Gaussian
    noise with zero mean and standard deviation
    ``np.std(data, ddof=1) / sqrt(n_samples)`` is then added to every
    resampled value.

    Parameters
    ----------
    data : array_like
        A 1-dimensional data array of size n_samples
    n_bootstraps : integer
        the number of bootstrap samples to compute.  Note that internally,
        arrays of size (n_bootstraps, n_samples) will be allocated.
        For very large numbers of bootstraps, this can cause memory issues.
    user_statistic : function
        The statistic to be computed.  This should take an array of data
        of size (n_bootstraps, n_samples) and return the row-wise
        statistics of the data.
    kwargs : dictionary (optional)
        A dictionary of keyword arguments to be passed to the
        user_statistic function.
    random_state : RandomState or an int seed (None by default)
        A random number generator instance.  It drives both the index
        resampling and the smoothing noise.

    Returns
    -------
    distribution : ndarray
        the bootstrapped distribution of statistics, as returned by
        `user_statistic`

    Raises
    ------
    ValueError
        if `data` is not 1-dimensional
    """
    # we don't set kwargs={} by default in the argument list, because using
    # a mutable type as a default argument can lead to strange results
    if kwargs is None:
        kwargs = {}

    # reject bad input first, before any random state is created
    data = np.asarray(data)
    if data.ndim != 1:
        raise ValueError("bootstrap expects 1-dimensional data")
    n_datapts = data.size

    rng = check_random_state(random_state)

    # Generate random indices with repetition
    ind = rng.randint(n_datapts, size=(n_bootstraps, n_datapts))

    # Smoothing noise: use `rng`, not the global numpy generator, so that
    # seeding via `random_state` makes the result reproducible
    noisesigma = np.std(data, ddof=1) / np.sqrt(n_datapts)
    noise = rng.normal(0., noisesigma, (n_bootstraps, n_datapts))
    databroadcast = data[ind] + noise

    # compute the statistic on the perturbed resamples
    return user_statistic(databroadcast, **kwargs)
def bootstrap(data, n_bootstraps, user_statistic, kwargs=None,
              pass_indices=False, random_state=None):
    """Bootstrap a statistic of a 1-dimensional data set.

    Parameters
    ----------
    data : array_like
        A 1-dimensional data array of size n_samples
    n_bootstraps : integer
        Number of bootstrap samples.  An index array of size
        (n_bootstraps, n_samples) is allocated, plus a resampled data
        array of the same size unless `pass_indices` is True, so very
        large values can cause memory issues.
    user_statistic : function
        Called once with an array of size (n_bootstraps, n_samples); it
        should return the row-wise statistics of that array.
    kwargs : dictionary (optional)
        Keyword arguments forwarded to `user_statistic`.
    pass_indices : boolean (optional)
        If True, `user_statistic` receives the resampling indices rather
        than the resampled data points.
    random_state : RandomState or an int seed (None by default)
        Passed through ``check_random_state`` to obtain the generator.

    Returns
    -------
    distribution : ndarray
        the value returned by `user_statistic`

    Raises
    ------
    ValueError
        if `data` is not 1-dimensional
    """
    # a fresh dict per call avoids sharing a mutable default between calls
    extra_args = {} if kwargs is None else kwargs

    generator = check_random_state(random_state)

    values = np.asarray(data)
    if values.ndim != 1:
        raise ValueError("bootstrap expects 1-dimensional data")
    count = values.size

    # indices drawn with repetition, one row per bootstrap sample
    draws = generator.randint(count, size=(n_bootstraps, count))

    payload = draws if pass_indices else values[draws]
    return user_statistic(payload, **extra_args)
def bootstrap_correln_results(xydata, Nbootstraps, random_state=None):
    """Bootstrap three correlation statistics for paired (x, y) data.

    Parameters
    ----------
    xydata : ndarray
        2-dimensional array; column 0 is used as x and column 1 as y.
    Nbootstraps : integer
        number of bootstrap resamples
    random_state : optional
        forwarded to ``check_random_state`` to obtain the generator

    Returns
    -------
    results : ndarray, shape (3, Nbootstraps)
        first element of the result of ``stats.pearsonr`` (row 0),
        ``stats.spearmanr`` (row 1) and ``stats.kendalltau`` (row 2)
        for every resample
    """
    xcol = xydata[:, 0]
    ycol = xydata[:, 1]
    npts = len(xcol)

    measures = (stats.pearsonr, stats.spearmanr, stats.kendalltau)
    out = np.zeros((len(measures), Nbootstraps))

    generator = check_random_state(random_state)
    # column `col` of `picks` holds the indices of one resample
    picks = generator.randint(npts, size=(npts, Nbootstraps))

    for col in range(Nbootstraps):
        chosen = picks[:, col]
        xs = xcol[chosen]
        ys = ycol[chosen]
        for row, measure in enumerate(measures):
            out[row, col] = measure(xs, ys)[0]
    return out
def newdatafullbootstrap(runindex):
    """Simulate one linear data set and compare bootstrap and analytic errors.

    A fake data set ``y = 2 x + 5 + noise`` with 50 points is generated,
    fitted with a straight line, and the fit is repeated on 1000 bootstrap
    resamples (in parallel through ``jl.Parallel``).  The half-width of
    the bootstrap 68% interval of each parameter is then divided by the
    corresponding uncertainty from ``np.polyfit``'s covariance matrix and
    from the analytic least-squares formulae.

    Parameters
    ----------
    runindex : any
        identifier of this run; returned unchanged so that results from
        several runs can be told apart

    Returns
    -------
    runindex, slope_err_ratio, int_err_ratio, slope_err_ratio2, int_err_ratio2
        `*_err_ratio` use the np.polyfit covariance as denominator,
        `*_err_ratio2` use the analytic uncertainties.
    """
    # NOTE: all print and plot commands removed, would be blocked by
    # multiprocessing wrapper anyway
    nproc = 3

    # Generate a fresh seed
    npr.seed()

    # Generating fake data set to start with:
    alphatrue = 2.   # slope
    betatrue = 5.    # intercept
    errs = 2.5       # sigma (amplitude of errors)
    narr = 50        # number of data points
    xvals = np.arange(narr) + 1.   # xvals are 1, 2, ..., narr
    yvals = alphatrue * xvals + betatrue + npr.normal(0, errs, narr)

    # Determine slope & y-intercept using the least squares analytic
    # solution.  The MLE values of the slope and y-intercept are equivalent
    # to the "least squares" fit results.
    xmean = np.mean(xvals)
    ymean = np.mean(yvals)
    alphaest = (xmean * ymean - np.mean(xvals * yvals)) / \
        (xmean ** 2 - np.mean(xvals ** 2))
    betaest = ymean - alphaest * xmean

    # Compute analytic uncertainties on slope and y-intercept; the residual
    # sum of squares and Sxx are shared by both expressions
    resid_ss = np.sum((yvals - (alphaest * xvals + betaest)) ** 2)
    sxx = np.sum((xvals - xmean) ** 2)
    alphaunc = np.sqrt(resid_ss / ((narr - 2.) * sxx))
    betaunc = np.sqrt((resid_ss / (narr - 2.)) *
                      ((1. / narr) + (xmean ** 2) / sxx))

    # Parameter uncertainties can also be obtained from the diagonal terms
    # of the covariance matrix, which np.polyfit returns when cov=True
    # (coefficient of the highest power comes first)
    _, covp = np.polyfit(xvals, yvals, 1, cov=True)

    # BOOTSTRAP!
    nboot = 1000  # usually want at least 1000
    rng = check_random_state(None)
    # column `iboot` of `ind` holds the indices of one resample
    ind = rng.randint(narr, size=(narr, nboot))
    # `range` (not `xrange`) so the code runs on both Python 2 and 3
    bootres = jl.Parallel(n_jobs=nproc)(
        jl.delayed(np.polyfit)(xvals[ind[:, iboot]], yvals[ind[:, iboot]], 1)
        for iboot in range(nboot))
    sloperesults, intresults = zip(*bootres)

    # positions of the 16th and 84th percentiles in the sorted results
    pct16 = int(round(0.16 * nboot))
    pct84 = int(round(0.84 * nboot))

    slopesorted = np.sort(sloperesults)
    slopemed = np.median(sloperesults)
    slope68pcterrs = [slopemed - slopesorted[pct16],
                      slopesorted[pct84] - slopemed]

    intsorted = np.sort(intresults)
    intmed = np.median(intresults)
    int68pcterrs = [intmed - intsorted[pct16],
                    intsorted[pct84] - intmed]

    # half of (lower error + upper error) is the mean 68% half-width
    slope_halfwidth = 0.5 * (np.sum(slope68pcterrs))
    int_halfwidth = 0.5 * (np.sum(int68pcterrs))

    slope_err_ratio = slope_halfwidth / np.sqrt(covp[0, 0])
    int_err_ratio = int_halfwidth / np.sqrt(covp[1, 1])
    slope_err_ratio2 = slope_halfwidth / alphaunc
    int_err_ratio2 = int_halfwidth / betaunc

    return runindex, slope_err_ratio, int_err_ratio, slope_err_ratio2, int_err_ratio2
# Can also obtain parameter uncertainties from the diagonal terms of the
# covariance matrix, which is the inverse of the Hessian matrix and
# can be computed in np.polyfit by setting cov=True
pfit, covp = np.polyfit(xvals, yvals, 1, cov=True)
# returns coeff. of highest power first
# setting cov=True returns the covariance matrix
print("slope is %0.7f +- %0.7f" % (pfit[0], np.sqrt(covp[0, 0])))
print("intercept is %0.7f +- %0.7f" % (pfit[1], np.sqrt(covp[1, 1])))

# BOOTSTRAP!
npars = 2  # slope and intercept
nboot = 10000  # usually want at least 1000
rng = check_random_state(None)
# column `iboot` of `ind` holds the indices (drawn with repetition) of one
# bootstrap resample of the narr data points
ind = rng.randint(narr, size=(narr, nboot))
bootresults = np.zeros((npars, nboot))
# `range` rather than `xrange` so the loop runs on both Python 2 and 3
for iboot in range(nboot):
    bootresults[:, iboot] = np.polyfit(xvals[ind[:, iboot]],
                                       yvals[ind[:, iboot]], 1)
sloperesults = bootresults[0, :]
intresults = bootresults[1, :]

# 68% interval of the slope: distance from the median to the values found
# at the 16% and 84% positions of the sorted bootstrap results
slopesort = np.argsort(sloperesults)
slopemed = np.median(sloperesults)
pct16 = int(round(0.16 * nboot))
pct84 = int(round(0.84 * nboot))
slope68pcterrs = [
    slopemed - sloperesults[slopesort[pct16]],
    sloperesults[slopesort[pct84]] - slopemed
]
# select the preferred frequentist linear fit between the forward, inverse, # and bisector fits, plot it, and print out its coefficients (slope and # intercept expressed assuming yy = slope*xx + intercept) # use bootstrap resampling of the data to determine the 68% confidence # intervals for the slope and intercept you computed above # (hint: you cannot use the pre-made bootstrap/smoothedbootstrap functions # but must construct a custom bootstrap as Ivezic Fig. 3.24 had to do) nboot = 100 Ndata = len(xx) sloperesults = np.zeros(nboot) intresults = np.zeros(nboot) random_state = None rng = check_random_state(random_state) ind = rng.randint(Ndata, size=(Ndata, nboot)) #for k in range(nboot): #slopesort=np.argsort(sloperesults) #intsort=np.argsort(intresults) #print "68%% confidence interval for slope: %0.2f -- %0.2f" % (,) #print "68%% confidence interval for intercept: %0.2f -- %0.2f" % (,) # now suppose we wish to determine whether a 2nd order model is # superior to a 1st order one for this data set -- this problem is too # hard for a 3-hour test if we allow for errors in both variables, so # we'll assume from now on that all the scatter is in the yy direction # perform 1st and 2nd order polyfit results
def LS_window_white_noise(t, omega, y=0, dy=1, generalized=False,
                          subtract_mean=False, random_state=None,
                          N_mock=100, hist=False, plot_hist=True, Nbins=200):
    """Use a monte carlo simulation to compute Lomb-Scargle white noise
    significance for the given spectral window

    For each of `N_mock` simulations an independent Gaussian series is
    drawn at the times `t` and its periodogram is evaluated at `omega`.

    Parameters
    ----------
    The first set of parameters are passed to the lomb_scargle algorithm

    t : array_like
        sequence of times
    omega : array_like
        frequencies at which to evaluate p(omega)
    y : float or array_like
        mean of the simulated observations (default 0)
    dy : float or array_like
        standard deviation of the simulated noise (default 1); it is also
        passed to lomb_scargle as the observational error
    generalized : bool
        if True use generalized lomb-scargle method,
        otherwise (default) use classic lomb-scargle.
    subtract_mean : bool
        if True subtract the sample mean from the data before
        computing the periodogram (default False).  Only referenced if
        generalized is False

    Remaining parameters control the simulation

    random_state : None, int, or RandomState object
        random seed, or random number generator; all mock series are
        drawn from it
    N_mock : int
        number of simulations
    hist : bool
        if True, also return the frequencies of the highest peaks
    plot_hist : bool
        if True (and hist is True), plot a histogram of the peak
        frequencies and the cumulative distribution of the peak heights
    Nbins : int
        number of bins of the first histogram

    Returns
    -------
    D : ndarray
        distribution of the height of the highest peak
    if hist=True:
    omegaD : ndarray
        distribution of the angular frequencies corresponding to D
    """
    random_state = check_random_state(random_state)
    t = np.asarray(t)

    D = np.zeros(N_mock)
    omegaD = np.zeros(N_mock)

    for i in range(N_mock):
        # Draw every mock series around the *input* mean `y`.  Re-assigning
        # `y` here would centre each draw on the previous one and turn the
        # white noise into a cumulative random walk.
        y_mock = random_state.normal(y, dy, size=len(t))
        p = lomb_scargle(t, y_mock, dy, omega,
                         generalized=generalized, subtract_mean=subtract_mean)
        D[i] = p.max()
        omegaD[i] = omega[p.argmax()]

    if hist:
        if plot_hist:
            from matplotlib import pyplot as plt

            # angular frequency -> frequency
            frecD = omegaD / (2 * np.pi)
            plt.figure('white noise peak hist')
            # `density` replaces the `normed` keyword that was removed
            # from matplotlib
            plt.hist(frecD, density=True, bins=Nbins)
            plt.hist(frecD, density=True, histtype='step')

            plt.figure('white noise peak cumhist')
            Xcum = np.sort(D)
            Ycum = np.arange(N_mock) / float(N_mock)
            plt.plot(Xcum, Ycum)
            plt.grid()
            plt.minorticks_on()
            plt.xlabel('')
        return D, omegaD
    else:
        return D
def two_point_angular(ra, dec, bins, BT_D=None, BT_R=None, method='standard',
                      ra_R=None, dec_R=None, random_state=None,
                      return_trees=False, verbose=False, RR=None,
                      return_RR=False, return_DD=False):
    """Angular two-point correlation function

    A separate function is needed because angular distances are not
    euclidean, and random sampling needs to take into account the
    spherical volume element.  The positions are converted with
    ``ra_dec_to_xyz``, the bins with ``angular_dist_to_euclidean_dist``,
    and the actual computation is delegated to ``two_point``.

    There are a number of options for what gets returned.  The order will
    always be correlation_function, data_balltree, random_balltree,
    random_random, data_data.  If the user asks for a subset of those, the
    list will be shorter but the order will be maintained.

    Parameters
    ----------
    ra : array_like
        input right ascension, shape = (n_samples,)
    dec : array_like
        input declination, shape = (n_samples,)
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1
    BT_D : BallTree (optional)
        ball tree created with the data positions.  The positions given to
        the BallTree should be Euclidean and not angular
    BT_R : BallTree (optional)
        ball tree created with the random positions.  The positions given
        to the BallTree should be Euclidean and not angular
    method : string (optional)
        "standard" or "landy-szalay".  Default is 'standard'
    ra_R, dec_R : array_like (optional if no BT_R)
        the random sample to be used.  If you pass BT_R as an argument, you
        must also pass the random sample it was made from with this.  If
        either one is None, a new random sample of 2 * n_samples points is
        drawn with ``uniform_sphere`` over the ra/dec range of the data.
    random_state : integer, np.random.RandomState, or None (optional)
        random state forwarded to ``two_point``.  Default is None.
        NOTE(review): it is not passed to ``uniform_sphere``, so the
        generated random sample does not appear to be controlled by it.
    return_trees : boolean (optional)
        if True, the returns will include the ball trees for the data and
        random sets.  Default is False
    verbose : boolean (optional)
        if True, print progress messages here and in ``two_point``.
        Default is False
    RR : array-like, shape = Nbins (optional)
        If this exact set of randoms and theta bins has been run, you can
        supply the RR counts and not calculate them again.  You also need
        the data if you're running landy-szalay.
    return_RR : boolean (optional)
        If you know you'll be running a CF with this exact same random
        sample and binning (like with a bootstrap), you can get the RR
        counts returned and feed them back in the next time.
        Default is False.
    return_DD : boolean (optional)
        In case you want to fit to the pair counts rather than the w(theta)
        estimator, you can get this back too.  Default is False

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    data_tree : BallTree (optional)
        the ball tree used to calculate distances between objects quickly
        in the data.  only returned if return_trees == True
    random_tree : BallTree (optional)
        the ball tree used to calculate distances between objects quickly
        in the randomly generated set.  only returned if
        return_trees == True
    RR : ndarray (optional)
        the RR counts may be returned (if return_RR==True) and used again
        without recomputing if the theta bins and the random sample is
        exactly the same
    DD : ndarray (optional)
        the DD pair counts, returned if return_DD==True

    Raises
    ------
    ValueError
        for an unknown `method`, bins that are not 1-dimensional, or
        ra/dec that are not 1-dimensional arrays of equal length
    """
    ra = np.asarray(ra)
    dec = np.asarray(dec)
    # convert before inspecting .ndim so that plain sequences are accepted
    bins = np.asarray(bins)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if (ra.ndim != 1) or (dec.ndim != 1) or (ra.shape != dec.shape):
        raise ValueError('ra and dec must be 1-dimensional '
                         'arrays of the same length')

    rng = check_random_state(random_state)

    # draw a random sample with 2N points
    if (ra_R is None) or (dec_R is None):
        if verbose:
            print("two_point_angular says: generating random sample")
        ra_R, dec_R = uniform_sphere((min(ra), max(ra)),
                                     (min(dec), max(dec)),
                                     2 * len(ra))

    data = np.asarray(ra_dec_to_xyz(ra, dec), order='F').T
    data_R = np.asarray(ra_dec_to_xyz(ra_R, dec_R), order='F').T

    # convert spherical bins to cartesian bins
    bins_transform = angular_dist_to_euclidean_dist(bins)

    if verbose:
        print("two_point_angular says: transform complete, calling two_point")

    return two_point(data, bins_transform, method=method,
                     BT_D=BT_D, BT_R=BT_R, data_R=data_R, random_state=rng,
                     return_trees=return_trees, verbose=verbose, RR=RR,
                     return_RR=return_RR, return_DD=return_DD)
def two_point(data, bins, BT_D=None, BT_R=None, method='standard',
              data_R=None, random_state=None, return_trees=False,
              verbose=False, RR=None, return_RR=False, return_DD=False):
    # Edited by CW to allow user to supply more things and have more things
    # returned.
    """Two-point correlation function in Euclidean space.

    Options to return a number of things.  What gets returned is up to the
    user but the order will always be correlation_function, data_balltree,
    random_balltree, random_random, data_data.  If the user asks for a
    subset of those, the list will be shorter but the order will be
    maintained.  If no extras are requested, the correlation function is
    returned on its own (not wrapped in a list).

    Parameters
    ----------
    data : array_like
        Input data, shape = [n_samples, n_features]
    bins : array_like
        Bins within which to compute the 2-point correlation.
        Shape = Nbins + 1
    BT_D : BallTree (optional)
        Ball tree created with the data positions
    BT_R : BallTree (optional)
        Ball tree created with the random positions
    method : string (optional)
        "standard" or "landy-szalay".  Default is 'standard'.
    data_R : array_like (optional if no BT_R)
        If specified, use this as the random comparison sample.  This must
        be included if you wish to use a pre-computed random ball tree.
        If omitted, a background sample is built by shuffling all but the
        last column of a copy of `data`.
    random_state : integer, np.random.RandomState, or None (optional)
        Specify the random state to use for generating background.  Not
        used if the randoms are provided by the user.  Default is None
    return_trees : boolean (optional)
        If True, the returns will include the data and random ball trees.
        Default is False.
    verbose : boolean (optional)
        Determines whether or not the function narrates what it's doing.
        Default is False.
    RR : 1D array-like, shape = Nbins
        If this exact set of randoms and theta bins has been run, you can
        supply the RR counts and not calculate them again.  You also need
        the data if you're running with method='landy-szalay'.  The
        supplied array is never modified.
    return_RR : boolean (optional)
        If you know you'll be running a CF with this exact same random
        sample and binning (like with a bootstrap), you can get the RR
        counts returned and feed them back in the next time
    return_DD : boolean (optional)
        In case you want to fit to the pair counts rather than the w(theta)
        estimator, you can get this back too

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins.  Bins whose RR count is zero are set to NaN.
    data_tree : BallTree (optional)
        the ball tree used to calculate distances between objects quickly
        in the data.  only returned if return_trees == True
    random_tree : BallTree (optional)
        the ball tree used to calculate distances between objects quickly
        in the randomly generated set.  only returned if
        return_trees == True
    RR : ndarray (optional)
        the RR counts (empty bins are reported as 0) may be returned
        (if return_RR==True) and used again without recomputing if the
        theta bins and the random sample is exactly the same
    DD : ndarray (optional)
        the DD pair counts, returned if return_DD==True

    Raises
    ------
    ValueError
        for an unknown `method`, bins that are not 1-dimensional, data
        that are not 1D or 2D, a `data_R` with the wrong number of
        features, or an `RR` whose shape is not (Nbins,)
    """
    data = np.asarray(data)
    bins = np.asarray(bins)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    n_samples, n_features = data.shape
    Nbins = len(bins) - 1

    rng = check_random_state(random_state)

    # shuffle all but one axis to get background distribution
    if data_R is None:
        if verbose:
            print("two_point says: generating random sample")
        data_R = data.copy()
        for i in range(n_features - 1):
            rng.shuffle(data_R[:, i])
    else:
        data_R = np.asarray(data_R)
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    factor = len(data_R) * 1. / len(data)

    if BT_D is None:
        if verbose:
            print("two_point says: computing BallTree for data")
        BT_D = BallTree(data)
    if BT_R is None:
        if verbose:
            print("two_point says: computing BallTree for random sample")
        BT_R = BallTree(data_R)

    if verbose:
        print("two_point says: working through the CF calc. "
              "This could take a while")

    # cumulative pair counts within each bin edge; differencing them gives
    # the counts per bin
    counts_DD = np.zeros(Nbins + 1)
    for i in range(Nbins + 1):
        counts_DD[i] = np.sum(BT_D.query_radius(data, bins[i],
                                                count_only=True))
    DD = np.diff(counts_DD)

    if RR is None:
        counts_RR = np.zeros(Nbins + 1)
        for i in range(Nbins + 1):
            counts_RR[i] = np.sum(BT_R.query_radius(data_R, bins[i],
                                                    count_only=True))
        RR = np.diff(counts_RR)
    else:
        RR = np.asarray(RR, dtype=float)
        if RR.shape != DD.shape:
            raise ValueError("RR must have shape (Nbins,)")

    if verbose:
        print("two_point says: binning done!")

    # check for zero in the denominator.  A separate denominator is used so
    # that RR itself (which may belong to the caller and may be returned)
    # keeps its true counts; overwriting zeros in place would hide the
    # empty bins the next time the same RR is supplied.
    RR_zero = (RR == 0)
    RR_denom = np.where(RR_zero, 1, RR)

    if method == 'standard':
        corr = factor ** 2 * DD / RR_denom - 1
    elif method == 'landy-szalay':
        counts_DR = np.zeros(Nbins + 1)
        for i in range(Nbins + 1):
            counts_DR[i] = np.sum(BT_R.query_radius(data, bins[i],
                                                    count_only=True))
        DR = np.diff(counts_DR)
        corr = (factor ** 2 * DD - 2 * factor * DR + RR_denom) / RR_denom

    corr[RR_zero] = np.nan

    # collect the optional extras in the documented order
    extras = []
    if return_trees:
        extras.append(BT_D)
        extras.append(BT_R)
    if return_RR:
        extras.append(RR)
    if return_DD:
        extras.append(DD)

    if extras:
        return [corr] + extras
    return corr
def generate_power_DRW(N, dt, sigma, tau, generate_complex=False,
                       random_state=None):
    """Generate a random light curve with a Lorentzian-shaped spectrum

    This uses the method from Timmer & Koenig [1]_: Gaussian random
    Fourier coefficients are scaled by
    ``sigma * sqrt(tau / (1 + omega^2 tau^2))`` and transformed back to
    the time domain.  The zero-frequency coefficient is left at zero.

    Parameters
    ----------
    N : integer
        Number of equal-spaced time steps to generate
    dt : float
        Spacing between time-steps
    sigma : float
        Overall factor multiplying the Fourier amplitudes
    tau : float
        Time scale entering the spectral shape
        ``tau / (1 + omega^2 tau^2)``
    generate_complex : boolean (optional)
        if True, generate a complex time series rather than a real time
        series
    random_state : None, int, or np.random.RandomState instance (optional)
        random seed or random number generator

    Returns
    -------
    x : ndarray
        the length-N light curve

    References
    ----------
    .. [1] Timmer, J. & Koenig, M. On Generating Power Law Noise. A&A 300:707
    """
    import numpy as np
    from astroML.utils import check_random_state

    random_state = check_random_state(random_state)
    dt = float(dt)
    N = int(N)

    Npos = int(N / 2)
    domega = (2 * np.pi / dt / N)

    # angular frequencies: the full (shifted) grid for a complex series,
    # only the non-negative half for a real one
    if generate_complex:
        omega = domega * np.fft.ifftshift(np.arange(N) - int(N / 2))
    else:
        omega = domega * np.arange(Npos + 1)

    x_fft = np.zeros(len(omega), dtype=complex)
    x_fft.real[1:] = random_state.normal(0, 1, len(omega) - 1)
    x_fft.imag[1:] = random_state.normal(0, 1, len(omega) - 1)

    # The scaling is applied in two steps (kept as written so that results
    # are bit-for-bit unchanged); the net factor is
    # sigma * sqrt(tau / (1 + omega^2 tau^2))
    x_fft[1:] *= np.sqrt(2) * sigma * np.sqrt(tau / (1 + omega[1:]**2 * tau**2))
    x_fft[1:] *= (1. / np.sqrt(2))

    # for a real series with an even number of points the last (highest)
    # frequency coefficient is made real
    if (not generate_complex) and (N % 2 == 0):
        x_fft.imag[-1] = 0

    if generate_complex:
        x = np.fft.ifft(x_fft)
    else:
        x = np.fft.irfft(x_fft, N)

    return x
print("np.polyfit MLE y-intercept = %0.7f" %pfit[1])

# Can also obtain parameter uncertainties from the diagonal terms of the
# covariance matrix, which is the inverse of the Hessian matrix and
# can be computed in np.polyfit by setting cov=True
pfit, covp = np.polyfit(xvals, yvals, 1, cov=True)
# returns coeff. of highest power first
# setting cov=True returns the covariance matrix
print("slope is %0.7f +- %0.7f" % (pfit[0], np.sqrt(covp[0,0])))
print("intercept is %0.7f +- %0.7f" % (pfit[1], np.sqrt(covp[1,1])))

# BOOTSTRAP!
npars = 2  # slope and intercept
nboot = 10000  # usually want at least 1000
rng = check_random_state(None)
# column `iboot` of `ind` holds the indices (drawn with repetition) of one
# bootstrap resample of the narr data points
ind = rng.randint(narr, size=(narr, nboot))
bootresults = np.zeros((npars, nboot))
# `range` rather than `xrange` so the loop runs on both Python 2 and 3
for iboot in range(nboot):
    bootresults[:, iboot] = np.polyfit(xvals[ind[:, iboot]],
                                       yvals[ind[:, iboot]], 1)
sloperesults = bootresults[0, :]
intresults = bootresults[1, :]

# 68% intervals: distance from the median to the values found at the 16%
# and 84% positions of the sorted bootstrap results
slopesort = np.argsort(sloperesults)
slopemed = np.median(sloperesults)
pct16 = int(round(0.16 * nboot))
pct84 = int(round(0.84 * nboot))
slope68pcterrs = [slopemed - sloperesults[slopesort[pct16]],
                  sloperesults[slopesort[pct84]] - slopemed]
intsort = np.argsort(intresults)
intmed = np.median(intresults)
int68pcterrs = [intmed - intresults[intsort[pct16]],
                intresults[intsort[pct84]] - intmed]
def LS_bootstrap_err_est(t, y, dy, omega, generalized=True,
                         subtract_mean=True, N_bootstraps=100,
                         random_state=None, hist=False, plot_hist=True,
                         Nbins=200):
    """Use a bootstrap analysis to compute Lomb-Scargle error estimation

    For every bootstrap the (t, y, dy) points are resampled with
    replacement and the periodogram of the resample is evaluated at
    `omega`.

    Parameters
    ----------
    The first set of parameters are passed to the lomb_scargle algorithm

    t : array_like
        sequence of times
    y : array_like
        sequence of observations
    dy : array_like
        sequence of observational errors (broadcast to the shape of y)
    omega : array_like
        frequencies at which to evaluate p(omega)
    generalized : bool
        if True (default) use generalized lomb-scargle method
        otherwise, use classic lomb-scargle.
    subtract_mean : bool
        if True (default) subtract the sample mean from the data before
        computing the periodogram.  Only referenced if generalized is False

    Remaining parameters control the bootstrap

    N_bootstraps : int
        number of bootstraps
    random_state : None, int, or RandomState object
        random seed, or random number generator
    hist : bool
        if True, also return the frequencies of the highest peaks
    plot_hist : bool
        if True (and hist is True), plot a histogram of the peak
        frequencies and the cumulative distribution of the peak heights
    Nbins : int
        number of bins of the first histogram

    Returns
    -------
    D : ndarray
        distribution of the height of the highest peak
    if hist=True:
    omegaD : ndarray
        distribution of the angular frequencies corresponding to D
    """
    random_state = check_random_state(random_state)
    t = np.asarray(t)
    y = np.asarray(y)
    dy = np.asarray(dy) + np.zeros_like(y)

    D = np.zeros(N_bootstraps)
    omegaD = np.zeros(N_bootstraps)

    for i in range(N_bootstraps):
        ind = random_state.randint(0, len(y), len(y))
        # Author's note (translated): the time vector is pathological,
        # since there is an isolated observation on two different days;
        # although not entirely correct, the two points of those specific
        # nights could be kept fixed to prevent the mock periodogram from
        # giving odd results (i.e. period=0).  That override is disabled:
        ##ind[-2]=11
        ##ind[-1]=-1
        p = lomb_scargle(t[ind], y[ind], dy[ind], omega,
                         generalized=generalized, subtract_mean=subtract_mean)
        D[i] = p.max()
        omegaD[i] = omega[p.argmax()]

    if hist:
        if plot_hist:
            from matplotlib import pyplot as plt

            # angular frequency -> frequency
            frecD = omegaD / (2 * np.pi)
            plt.figure('bootstrap hist')
            # `density` replaces the `normed` keyword that was removed
            # from matplotlib
            plt.hist(frecD, density=True, bins=Nbins)
            plt.hist(frecD, density=True, histtype='step')

            plt.figure('bootstrap cumhist')
            Xcum = np.sort(D)
            Ycum = np.arange(N_bootstraps) / float(N_bootstraps)
            plt.plot(Xcum, Ycum)
            plt.grid()
            plt.minorticks_on()
            plt.xlabel('')
        return D, omegaD
    else:
        return D
def LS_null_hypotesis(t, y, dy, omega, generalized=False, subtract_mean=False,
                      random_state=None, N_mock=1000, hist=False,
                      plot_hist=True, Nbins=200):
    """Use a monte carlo simulation to compute Lomb-Scargle null hypothesis
    periodogram shuffling the vector {t U y_obs}

    The values of `y` and `t` are pooled into one vector.  For every
    simulation that vector and the errors are shuffled; the first len(t)
    entries are then used as times and the last len(t) entries as
    observations.

    Parameters
    ----------
    The first set of parameters are passed to the lomb_scargle algorithm

    t : array_like
        sequence of times
    y : array_like
        sequence of observations
    dy : array_like
        sequence of observational errors (broadcast to the shape of y)
    omega : array_like
        frequencies at which to evaluate p(omega)
    generalized : bool
        if True use generalized lomb-scargle method,
        otherwise (default) use classic lomb-scargle.
    subtract_mean : bool
        if True subtract the sample mean from the data before
        computing the periodogram (default False).  Only referenced if
        generalized is False

    Remaining parameters control the simulation

    random_state : None, int, or RandomState object
        random seed, or random number generator; all shuffles are drawn
        from it
    N_mock : int
        number of simulations
    hist : bool
        if True, also return the frequencies of the highest peaks
    plot_hist : bool
        if True (and hist is True), plot a histogram of the peak
        frequencies and the cumulative distribution of the peak heights
    Nbins : int
        number of bins of the first histogram

    Returns
    -------
    D : ndarray
        distribution of the height of the highest peak
    if hist=True:
    omegaD : ndarray
        distribution of the angular frequencies corresponding to D
    """
    random_state = check_random_state(random_state)
    t = np.asarray(t)
    y = np.asarray(y)
    dy = np.asarray(dy) + np.zeros_like(y)

    D = np.zeros(N_mock)
    omegaD = np.zeros(N_mock)

    n_obs = len(t)
    mock_vector = np.append(y, t)
    mock_dy = dy.copy()

    for i in range(N_mock):
        # shuffle through `random_state` (not the global numpy generator)
        # so that a seed makes the simulation reproducible
        random_state.shuffle(mock_vector)
        random_state.shuffle(mock_dy)
        p = lomb_scargle(mock_vector[:n_obs], mock_vector[-n_obs:], mock_dy,
                         omega, generalized=generalized,
                         subtract_mean=subtract_mean)
        D[i] = p.max()
        omegaD[i] = omega[p.argmax()]

    if hist:
        if plot_hist:
            from matplotlib import pyplot as plt

            # angular frequency -> frequency
            frecD = omegaD / (2 * np.pi)
            plt.figure('null hypothesis hist')
            # `density` replaces the `normed` keyword that was removed
            # from matplotlib
            plt.hist(frecD, density=True, bins=Nbins)
            plt.hist(frecD, density=True, histtype='step')

            plt.figure('null hypotesis cumhist')
            Xcum = np.sort(D)
            Ycum = np.arange(N_mock) / float(N_mock)
            plt.plot(Xcum, Ycum)
            plt.grid()
            plt.minorticks_on()
            plt.xlabel('')
        return D, omegaD
    else:
        return D
# and bisector fits, plot it, and print out its coefficients (slope and # intercept expressed assuming yy = slope*xx + intercept) # use bootstrap resampling of the data to determine the 68% confidence # intervals for the slope and intercept you computed above # (hint: you cannot use the pre-made bootstrap/smoothedbootstrap functions # but must construct a custom bootstrap as Ivezic Fig. 3.24 had to do) nboot=100 Ndata=len(xx) sloperesults = np.zeros(nboot) intresults = np.zeros(nboot) random_state=None rng = check_random_state(random_state) ind = rng.randint(Ndata, size=(Ndata,nboot)) #for k in range(nboot): #slopesort=np.argsort(sloperesults) #intsort=np.argsort(intresults) #print "68%% confidence interval for slope: %0.2f -- %0.2f" % (,) #print "68%% confidence interval for intercept: %0.2f -- %0.2f" % (,) # now suppose we wish to determine whether a 2nd order model is # superior to a 1st order one for this data set -- this problem is too # hard for a 3-hour test if we allow for errors in both variables, so # we'll assume from now on that all the scatter is in the yy direction
def compute_CDF_bounds_by_MC(sample, sample_errp, sample_errm=None, sample_ll=None, sample_ul=None,
                             ll_max_val=None, ul_min_val=None, weights=None, weights_err=None,
                             weqs=False, confidence=95.0, bins=None, positive=False,
                             precision=1000, precision_pdf=1000, N_MC=1000, bootstrap=False,
                             verbose=False, random_state=None, show_plot=False, ax=None,
                             color='k', show_median=True, **kwargs):
    """
        Function to compute the lower and upper bounds of a cumulative
        distribution function for quantities with errors.
        Note if this function takes too long, try reducing N_MC or precision.
        Returns the middle point for the bins, the median CDF computed, the
        lower and upper CDF envelopes and the figure on which it has plotted.

        Parameters:
        -----------

        sample : [numpy array]
            Numpy array of the data for which to compute the bounds.

        sample_errp : [numpy array]
            Numpy array of the positive error on the data.

        sample_errm : [numpy array]
            Default is None
            Numpy array of the negative error on the data.
            If None, the errors are assumed to be symmetric and equal to
            sample_errp.

        sample_ll : [numpy array]
            Default is None
            Numpy array flagging lower limits.
            Expects 0 if not a limit, 1 if a limit.

        sample_ul : [numpy array]
            Default is None
            Numpy array flagging upper limits.
            Expects 0 if not a limit, 1 if a limit.

        ll_max_val : [numpy array]
            Default is None
            Upper edge of the flat prior used for lower limits (the prior
            spans from the limit value up to ll_max_val).
            If lower limits are provided (with sample_ll) but ll_max_val is
            not specified, the maximum value of the sample plus 5 times the
            largest negative error is used.

        ul_min_val : [numpy array]
            Default is None
            Lower edge of the flat prior used for upper limits (the prior
            spans from ul_min_val up to the limit value).
            If upper limits are provided (with sample_ul) but ul_min_val is
            not specified, the minimum value of the sample minus 5 times the
            largest negative error is used.

        weights : [numpy array]
            Default is None
            Numpy array of the weights on the data.
            If None, all weights are set to 1.

        weights_err : [numpy array or list/tuple of 2 numpy arrays]
            Default is None
            Error on the weights. If a numpy array is provided, it is used
            as both the plus and the minus error of the weights.
            If a list (or tuple) of 2 numpy arrays is provided, the first
            array is the plus error and the second the minus error.
            ex : [weights_errp, weights_errm]

        weqs : [bool]
            Default is False.
            If True, the weights are set equal to the realizations of the
            sample and are not redrawn.
            Use this if you are weighting the sample by itself.

        confidence : [float]
            Default is 95.0
            Confidence level of the bounds (in percent).

        bins : [numpy array]
            Default is None
            Bin edges in which to compute the cumulative distribution
            function. If None, the edges span from the min value of the
            sample minus 5 times the maximum negative error to the max value
            of the sample plus 5 times the maximum positive error, with
            'precision' bins.

        positive : [bool]
            Default is False
            Set to True if the quantity can not be negative (a distance for
            example).

        precision : [int]
            Default is 1000
            Number of bins in which to compute the cumulative distribution
            function. Overridden by len(bins) - 1 when bins is provided.

        precision_pdf : [int]
            Default is 1000
            Number of points with which to sample the probability
            distribution function of each data point (plotting only).

        N_MC : [int]
            Default is 1000
            Number of Monte Carlo realizations of the sample.

        bootstrap : [bool]
            Default is False
            Set to True to additionally resample every realization with
            replacement.

        verbose : [bool]
            Default is False
            If True, progress messages are sent to log.info.

        random_state : [None, int or RandomState]
            Default is None
            Passed to check_random_state; only used for the bootstrap
            indices.

        show_plot : [bool]
            Default is False
            If True, will create plots.

        ax : [matplotlib axes instance]
            Default is None
            If specified, will plot the median and upper and lower bounds on
            the ax. If None (and show_plot is True) a new two-panel figure is
            created which also shows the PDF of every individual point.

        color : [str]
            Default is 'k'
            Color of the CDF to plot.

        show_median : [bool]
            Default is True
            If True the median CDF is drawn in addition to the bounds.

        **kwargs : [dict]
            Any matplotlib key words to pass to the median plot.

        Returns
        -------
        bins_mid : [array]
            Numpy array of values for the middle of the bins used in the
            cumulative distribution function

        median : [array]
            Numpy array of values for the median at each bin.

        lower : [array]
            Numpy array of values for the lower bound at each bin.

        upper : [array]
            Numpy array of values for the upper bound at each bin.

        fig : [matplotlib Figure]
            The figure on which is drawn the plot.
            None if show_plot is False.

        Raises
        ------
        TypeError
            If sample is not a numpy array, or if weights_err is needed but
            is neither a numpy array nor a list/tuple of two arrays.
        IOError
            If sample_ll, sample_ul or weights do not have the same length
            as sample.
    """
    if show_plot:
        import matplotlib.pyplot as plt
        from matplotlib.lines import Line2D

    # Check inputs
    if not isinstance(sample, np.ndarray):
        raise TypeError('sample must be numpy array.')

    if verbose:
        log.info("In compute_CDF_bounds_by_MC: initializing...")

    # If plot is demanded but no ax is given, create figure and plot also
    # individual points' PDFs
    if show_plot and (ax is None):
        fig = plt.figure(figsize=(8, 5))
        ax1 = fig.add_subplot(211)
        ax2 = fig.add_subplot(212, sharex=ax1)
        fig.subplots_adjust(hspace=0)
        plt.setp(ax1.get_xticklabels(), visible=False)
        ax1.tick_params(axis='x', which='both', bottom=False)
    elif show_plot:
        ax1 = ax
        fig = plt.gcf()
    else:
        fig = None

    sample_len = len(sample)
    # If no negative error, assume errors are symmetric
    if sample_errm is None:
        sample_errm = sample_errp

    # Check if limits
    if sample_ll is None:
        sample_ll = np.zeros(sample_len)
    else:
        if len(sample_ll) != sample_len:
            raise IOError("sample_ll and sample must have same length")
        if ll_max_val is None:
            ll_max_val = (sample.max() + 5 * sample_errm.max()) * np.ones(sample_len)
    if sample_ul is None:
        sample_ul = np.zeros(sample_len)
    else:
        if len(sample_ul) != sample_len:
            raise IOError("sample_ul and sample must have same length")
        if ul_min_val is None:
            ul_min_val = (sample.min() - 5 * sample_errm.max()) * np.ones(sample_len)

    # Create percentiles:
    lower_percentile = (1. - confidence / 100.) / 2.
    upper_percentile = 1. - lower_percentile

    # Create bins
    if bins is None:
        bin_min = sample.min() - 5 * sample_errm.max()
        bin_max = sample.max() + 5 * sample_errp.max()
        bins = np.linspace(bin_min, bin_max, precision + 1)
    else:
        precision = len(bins) - 1
        bin_min = bins.min()
        bin_max = bins.max()

    # Middle value of the bins, warning this has a length of bins - 1
    # (i.e. of length precision here). Essentially used for plotting.
    bins_mid = 0.5 * (bins[1:] + bins[:-1])

    # Every realization array has one row per MC draw, one column per point
    real_shape = (N_MC, sample_len)

    # Create array for the weights
    weights_errp = None
    weights_errm = None
    if weights is not None:
        if len(weights) != sample_len:
            raise IOError("weights and sample must have same length")
        if weights_err is not None:
            # Realizations are filled in below, once per data point
            weights_real = np.zeros(real_shape)
            # NOTE: this must be a boolean `and`; with `&` the expression
            # parsed as `(isinstance(...) & len(...)) == 2`, which is never
            # true, so asymmetric errors were silently ignored.
            if isinstance(weights_err, (list, tuple)) and len(weights_err) == 2:
                weights_errp, weights_errm = weights_err
            # Otherwise symmetric errors
            elif isinstance(weights_err, np.ndarray):
                weights_errp = weights_err
                weights_errm = weights_err
            elif not weqs:
                # With weqs=True the errors are never used, so only complain
                # when they are actually needed.
                raise TypeError("weights_err must be a numpy array or a list "
                                "of 2 numpy arrays [weights_errp, weights_errm]")
        # If no error on the weights
        else:
            weights_real = weights * np.ones(real_shape)
    # If no weights, use weights = 1 for everything
    else:
        weights_real = np.ones(real_shape)

    # Array that will hold the PDF for each point in the sample (plots only)
    sample_pdf = np.zeros((precision_pdf, sample_len))

    if verbose:
        log.info("In compute_CDF_bounds_by_MC: starting Monte Carlo drawings...")
    # Generate N_MC realizations of every point of the sample
    sample_real = MC_realization(sample, sample_errp, sample_errm=sample_errm,
                                 sample_ll=sample_ll, sample_ul=sample_ul,
                                 ll_max_val=ll_max_val, ul_min_val=ul_min_val,
                                 N_MC=N_MC, positive=positive)

    # If there are weights with errors, generate N_MC realizations of each
    # weight as well
    if weqs:
        weights_real = sample_real.copy()
    elif (weights_err is not None) and (weights is not None):
        for i in range(sample_len):
            weights_real[:, i] = asym_gaussian_draw(weights[i],
                                                    sigma1=weights_errm[i],
                                                    sigma2=weights_errp[i],
                                                    nb_draws=N_MC,
                                                    positive=True)

    # Visualize the PDFs of each individual point in the sample
    if show_plot and (ax is None):
        if verbose:
            log.info("In compute_CDF_bounds_by_MC: starting drawings of individual point PDFs for plotting...")
        n_pdf = len(sample_pdf)
        for i in range(sample_len):
            is_ll = sample_ll[i] == 1
            is_ul = sample_ul[i] == 1
            if (sample_ll[i] == 0) & (sample_ul[i] == 0):
                # Regular detection: asymmetric gaussian PDF
                x, sample_pdf[:, i] = asym_gaussian_pdf(sample[i], x_min=bin_min, x_max=bin_max,
                                                        precision=n_pdf,
                                                        sigma1=sample_errm[i],
                                                        sigma2=sample_errp[i],
                                                        nb_draws=N_MC,
                                                        positive=positive)
            # If limits, create flat prior over the desired range
            elif is_ll & (sample_ul[i] == 0):
                x = np.linspace(bin_min, bin_max, n_pdf)
                a = 1. / np.abs(sample[i] - ll_max_val[i])
                sample_pdf[:, i] = np.zeros(n_pdf)
                sample_pdf[(x >= sample[i]) & (x <= ll_max_val[i]), i] = a
            elif (sample_ll[i] == 0) & is_ul:
                x = np.linspace(bin_min, bin_max, n_pdf)
                a = 1. / np.abs(ul_min_val[i] - sample[i])
                sample_pdf[:, i] = np.zeros(n_pdf)
                sample_pdf[(x <= sample[i]) & (x >= ul_min_val[i]), i] = a
                if positive:
                    sample_pdf[x <= 0, i] = 0.0
            else:
                x = np.linspace(bin_min, bin_max, n_pdf)
                a = 1. / np.abs(ul_min_val[i] - ll_max_val[i])
                sample_pdf[:, i] = np.zeros(n_pdf)
                sample_pdf[(x >= ul_min_val[i]) & (x <= ll_max_val[i]), i] = a

            ax2.plot(x, sample_pdf[:, i])

        # Total PDF of the sample: sum of the individual PDFs, normalized
        summed_pdf = np.sum(sample_pdf, axis=1)
        summed_pdf = summed_pdf / np.sum(summed_pdf * (x[1] - x[0]))
        ax2.plot(x, summed_pdf, c='k', lw=2, label='Summed PDF')
        ax2.legend()

    # Perform bootstrap
    if bootstrap:
        rng = check_random_state(random_state)
        ind = rng.randint(sample_len, size=(N_MC, sample_len))
        sample_bootstrapped = np.zeros(sample_real.shape)
        weights_bootstrapped = np.zeros(weights_real.shape)
        if verbose:
            log.info("In compute_CDF_bounds_by_MC: starting bootstraps...")
        # The same indices are applied to sample and weights so that every
        # resampled point keeps its own weight
        for i in range(N_MC):
            sample_bootstrapped[i] = sample_real[i][ind[i]]
            weights_bootstrapped[i] = weights_real[i][ind[i]]
    else:
        sample_bootstrapped = sample_real
        weights_bootstrapped = weights_real

    if verbose:
        log.info("In compute_CDF_bounds_by_MC: computing CDF...")
    # Compute the (weighted) CDF for each realization
    CDF_real = np.zeros((N_MC, precision))
    for i in range(N_MC):
        hist, _bins = np.histogram(sample_bootstrapped[i], bins=bins,
                                   weights=weights_bootstrapped[i])
        CDF_real[i, :] = np.cumsum(hist).astype(float) / float(np.sum(hist))

    # Compute the percentiles for each bin
    lower = np.zeros(precision)
    median = np.zeros(precision)
    upper = np.zeros(precision)
    for i in range(precision):
        q = mstats.mquantiles(CDF_real[:, i],
                              prob=[lower_percentile, 0.5, upper_percentile])
        lower[i] = q[0]
        median[i] = q[1]
        upper[i] = q[2]

    if show_plot:
        if verbose:
            log.info("In compute_CDF_bounds_by_MC: plotting...")
        # Fallback used when the median is not drawn: 2 is matplotlib's
        # default zorder for lines.
        plot_zorder = 2
        if show_median:
            artist, = ax1.plot(bins_mid, median, drawstyle='steps-mid', c=color, **kwargs)

            # Add the first and last line to make the plot look better
            plot_color = plt.getp(artist, 'color')
            plot_ls = plt.getp(artist, 'linestyle')
            plot_lw = plt.getp(artist, 'linewidth')
            plot_zorder = plt.getp(artist, 'zorder')
            # These lines are purely cosmetic: skip them rather than crash
            # if the median never leaves 0 or never reaches exactly 1.
            nonzero = np.flatnonzero(median > 0)
            saturated = np.flatnonzero(median == 1)
            if nonzero.size:
                imin = nonzero[0]
                xmin = bins_mid[imin]
                bottom_line = Line2D([xmin, xmin], [0, median[imin]],
                                     color=plot_color, linestyle=plot_ls,
                                     linewidth=plot_lw, zorder=plot_zorder)
                ax1.add_line(bottom_line)
                if saturated.size:
                    imax = saturated[0]
                    xmax = bins_mid[imax]
                    top_line = Line2D([xmax, xmax + (xmax - xmin)],
                                      [median[imax], median[imax]],
                                      color=plot_color, linestyle=plot_ls,
                                      linewidth=plot_lw, zorder=plot_zorder)
                    ax1.add_line(top_line)

        ax1.fill_between(bins_mid, lower, upper, step='mid', color=color,
                         alpha=0.3, zorder=plot_zorder - 1)
        ax1.plot(bins_mid, lower, drawstyle='steps-mid', lw=0.7, c=color, zorder=plot_zorder)
        ax1.plot(bins_mid, upper, drawstyle='steps-mid', lw=0.7, c=color, zorder=plot_zorder)

    return bins_mid, median, lower, upper, fig
def LS_bootstrap_sig(t, y, dy, omega, generalized=True, subtract_mean=True,
                     N_bootstraps=100, random_state=None, hist=False,
                     plot_hist=True, Nbins=200):
    """Use a bootstrap analysis to compute Lomb-Scargle significance

    For each bootstrap the pairs ``(y, dy)`` are resampled with replacement
    while the times ``t`` are kept fixed; the height and the angular
    frequency of the highest periodogram peak are recorded.

    Parameters
    ----------
    The first set of parameters are passed to the lomb_scargle algorithm

    t : array_like
        sequence of times
    y : array_like
        sequence of observations
    dy : array_like
        sequence of observational errors (a scalar is broadcast to the
        shape of ``y``)
    omega : array_like
        frequencies at which to evaluate p(omega)
    generalized : bool
        if True (default) use generalized lomb-scargle method
        otherwise, use classic lomb-scargle.
    subtract_mean : bool
        if True (default) subtract the sample mean from the data before
        computing the periodogram.  Only referenced if generalized is False

    Remaining parameters control the bootstrap

    N_bootstraps : int
        number of bootstraps
    random_state : None, int, or RandomState object
        random seed, or random number generator
    hist : bool
        if True, also return ``omegaD`` (and optionally plot, see
        ``plot_hist``)
    plot_hist : bool
        only referenced if ``hist`` is True.  If True, draw a histogram of
        the peak frequencies (omega / 2 pi) and the cumulative distribution
        of the peak heights with matplotlib.
    Nbins : int
        number of bins of the peak-frequency histogram

    Returns
    -------
    D : ndarray
        distribution of the height of the highest peak
    omegaD : ndarray
        only returned if ``hist`` is True: angular frequencies
        corresponding to the peaks in ``D``
    """
    random_state = check_random_state(random_state)
    t = np.asarray(t)
    y = np.asarray(y)
    # Adding zeros shaped like y broadcasts a scalar dy to one error per point
    dy = np.asarray(dy) + np.zeros_like(y)

    n_pts = len(y)
    D = np.zeros(N_bootstraps)
    omegaD = np.zeros(N_bootstraps)

    for i in range(N_bootstraps):
        # Resample observations (with their errors) with replacement
        ind = random_state.randint(0, n_pts, n_pts)
        p = lomb_scargle(t, y[ind], dy[ind], omega,
                         generalized=generalized, subtract_mean=subtract_mean)
        D[i] = p.max()
        omegaD[i] = omega[p.argmax()]

    if not hist:
        return D

    if plot_hist:
        from matplotlib import pyplot as plt

        frecD = omegaD / (2 * np.pi)
        plt.figure('bootstrap hist')
        # 'density' replaces the 'normed' keyword removed from matplotlib
        plt.hist(frecD, density=True, bins=Nbins)
        plt.hist(frecD, density=True, histtype='step')

        plt.figure('bootstrap cumhist')
        Xcum = np.sort(D)
        Ycum = np.arange(N_bootstraps) / float(N_bootstraps)
        plt.plot(Xcum, Ycum)
        plt.grid()
        plt.minorticks_on()
        plt.xlabel('')

    return D, omegaD