Example #1
def bootstrap(data,
              n_bootstraps,
              user_statistic,
              kwargs=None,
              pass_indices=False,
              random_state=None):
    """Compute bootstraped statistics of a dataset.

    Parameters
    ----------
    data : array_like
        An n-dimensional data array of size n_samples by n_attributes
    n_bootstraps : integer
        the number of bootstrap samples to compute.  Note that internally,
        two arrays of size (n_bootstraps, n_samples) will be allocated.
        For very large numbers of bootstraps, this can cause memory issues.
    user_statistic : function
        The statistic to be computed.  This should take an array of data
        of size (n_bootstraps, n_samples) and return the row-wise statistics
        of the data.
    kwargs : dictionary (optional)
        A dictionary of keyword arguments to be passed to the
        user_statistic function.
    pass_indices : boolean (optional)
        if True, then the indices of the points rather than the points
        themselves are passed to `user_statistic`
    random_state : RandomState or an int seed (None by default)
        A random number generator instance

    Returns
    -------
    distribution : ndarray
        the bootstrapped distribution of statistics (length = n_bootstraps)
    """
    # we don't set kwargs={} by default in the argument list, because using
    # a mutable type as a default argument can lead to strange results
    if kwargs is None:
        kwargs = {}

    rng = check_random_state(random_state)
    data = np.asarray(data)
    if data.ndim != 1:
        n_samples = data.shape[0]
        warnings.warn(
            "bootstrap data are n-dimensional: assuming ordered n_samples by n_attributes"
        )
    else:
        n_samples = data.size

    # Generate random indices with repetition
    ind = rng.randint(n_samples, size=(n_bootstraps, n_samples))
    # resample; for multi-attribute input the bootstrap samples are stacked
    # along the first axis before being passed to the statistic
    resampled = data[ind]
    data = resampled.reshape(-1, resampled.shape[-1])
    # Call the function
    if pass_indices:
        stat_bootstrap = user_statistic(ind, **kwargs)
    else:
        stat_bootstrap = user_statistic(data, **kwargs)

    # return the bootstrapped distribution of the statistic
    return stat_bootstrap
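# Usage sketch (illustrative, not from the original source): bootstrap the mean
# of a hypothetical 1-D sample.  `user_statistic` receives the resampled array
# of shape (n_bootstraps, n_samples), so np.mean with axis=1 gives the required
# row-wise statistic.
import numpy as np

x = np.random.normal(0, 1, size=1000)            # hypothetical data
dist = bootstrap(x, n_bootstraps=5000, user_statistic=np.mean,
                 kwargs=dict(axis=1), random_state=0)
print(np.percentile(dist, [16, 50, 84]))         # median and 68% interval of the bootstrapped means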
def smoothedbootstrap(data,
                      n_bootstraps,
                      user_statistic,
                      kwargs=None,
                      random_state=None):
    """Compute smoothed bootstrapped statistics of a data set.

    Parameters
    ----------
    data : array_like
        A 1-dimensional data array of size n_samples
    n_bootstraps : integer
        the number of bootstrap samples to compute.  Note that internally,
        two arrays of size (n_bootstraps, n_samples) will be allocated.
        For very large numbers of bootstraps, this can cause memory issues.
    user_statistic : function
        The statistic to be computed.  This should take an array of data
        of size (n_bootstraps, n_samples) and return the row-wise statistics
        of the data.
    kwargs : dictionary (optional)
        A dictionary of keyword arguments to be passed to the
        user_statistic function.
    random_state : RandomState or an int seed (None by default)
        A random number generator instance

    Returns
    -------
    distribution : ndarray
        the bootstrapped distribution of statistics (length = n_bootstraps)
    """
    # we don't set kwargs={} by default in the argument list, because using
    # a mutable type as a default argument can lead to strange results
    if kwargs is None:
        kwargs = {}

    rng = check_random_state(random_state)

    data = np.asarray(data)
    n_datapts = data.size

    if data.ndim != 1:
        raise ValueError("bootstrap expects 1-dimensional data")

    # Generate random indices with repetition
    ind = rng.randint(n_datapts, size=(n_bootstraps, n_datapts))

    # smoothing noise: zero-mean Gaussian with sigma = s / sqrt(n), drawn from
    # the same random state as the resampling indices
    noisemean = 0.
    noisesigma = np.std(data, ddof=1) / np.sqrt(n_datapts)
    noise = rng.normal(noisemean, noisesigma, (n_bootstraps, n_datapts))
    databroadcast = data[ind] + noise

    # Call the function
    stat_bootstrap = user_statistic(databroadcast, **kwargs)

    # return the bootstrapped distribution of the statistic
    return stat_bootstrap
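# Usage sketch (illustrative): the smoothed bootstrap adds zero-mean Gaussian
# noise with sigma = s / sqrt(n) to each resampled value, which helps for
# statistics such as the median that would otherwise only take values present
# in the original sample.
import numpy as np

x = np.random.normal(10, 2, size=200)            # hypothetical data
dist = smoothedbootstrap(x, n_bootstraps=2000, user_statistic=np.median,
                         kwargs=dict(axis=1), random_state=0)
print(dist.mean(), dist.std())                   # centre and spread of the bootstrapped medians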
Example #4
def bootstrap(data, n_bootstraps, user_statistic, kwargs=None,
              pass_indices=False, random_state=None):
    """Compute bootstraped statistics of a dataset.

    Parameters
    ----------
    data : array_like
        A 1-dimensional data array of size n_samples
    n_bootstraps : integer
        the number of bootstrap samples to compute.  Note that internally,
        two arrays of size (n_bootstraps, n_samples) will be allocated.
        For very large numbers of bootstraps, this can cause memory issues.
    user_statistic : function
        The statistic to be computed.  This should take an array of data
        of size (n_bootstraps, n_samples) and return the row-wise statistics
        of the data.
    kwargs : dictionary (optional)
        A dictionary of keyword arguments to be passed to the
        user_statistic function.
    pass_indices : boolean (optional)
        if True, then the indices of the points rather than the points
        themselves are passed to `user_statistic`
    random_state : RandomState or an int seed (None by default)
        A random number generator instance

    Returns
    -------
    distribution : ndarray
        the bootstrapped distribution of statistics (length = n_bootstraps)
    """
    # we don't set kwargs={} by default in the argument list, because using
    # a mutable type as a default argument can lead to strange results
    if kwargs is None:
        kwargs = {}

    rng = check_random_state(random_state)

    data = np.asarray(data)
    n_samples = data.size

    if data.ndim != 1:
        raise ValueError("bootstrap expects 1-dimensional data")

    # Generate random indices with repetition
    ind = rng.randint(n_samples, size=(n_bootstraps, n_samples))

    # Call the function
    if pass_indices:
        stat_bootstrap = user_statistic(ind, **kwargs)
    else:
        stat_bootstrap = user_statistic(data[ind], **kwargs)

    # return the bootstrapped distribution of the statistic
    return stat_bootstrap
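# Usage sketch (illustrative): with pass_indices=True the statistic receives the
# (n_bootstraps, n_samples) index array instead of resampled values, which is
# convenient when the statistic needs several aligned arrays (here hypothetical
# paired x and y).
import numpy as np

x = np.linspace(0, 1, 100)
y = 2 * x + np.random.normal(0, 0.1, size=100)   # hypothetical paired data

def slope_stat(ind):
    # one least-squares slope per row of bootstrap indices
    return np.array([np.polyfit(x[i], y[i], 1)[0] for i in ind])

dist = bootstrap(x, n_bootstraps=1000, user_statistic=slope_stat,
                 pass_indices=True, random_state=42)
print(np.percentile(dist, [16, 84]))             # 68% interval on the slope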
Example #5
def bootstrap_correln_results(xydata, Nbootstraps, random_state=None):
    """Bootstrap the Pearson r, Spearman rho, and Kendall tau correlation
    coefficients of paired (x, y) data; returns an array of shape
    (Ntests, Nbootstraps) with one row per statistic."""
    xdata = xydata[:, 0]
    ydata = xydata[:, 1]
    Ndata = len(xdata)
    Ntests = 3
    results = np.zeros((Ntests, Nbootstraps))
    rng = check_random_state(random_state)
    ind = rng.randint(Ndata, size=(Ndata, Nbootstraps))
    for k in range(Nbootstraps):
        for i, statistic in enumerate(
            [stats.pearsonr, stats.spearmanr, stats.kendalltau]):
            results[i, k] = statistic(xdata[ind[:, k]], ydata[ind[:, k]])[0]
    return results
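# Usage sketch (illustrative): each row of the result is the bootstrap
# distribution of one statistic (Pearson r, Spearman rho, Kendall tau, in that
# order), so percentiles of a row give confidence intervals.
import numpy as np

xy = np.random.multivariate_normal([0, 0], [[1, 0.6], [0.6, 1]], size=300)  # hypothetical data
res = bootstrap_correln_results(xy, Nbootstraps=500, random_state=1)
for name, row in zip(['Pearson r', 'Spearman rho', 'Kendall tau'], res):
    lo, hi = np.percentile(row, [16, 84])
    print("%s: 68%% interval %.3f -- %.3f" % (name, lo, hi))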
def newdatafullbootstrap(runindex):
    # NOTE: all print and plot commands removed, would be blocked by
    # multiprocessing wrapper anyway
    nproc = 3
    
    # Generate a fresh seed
    npr.seed()
    
    # Generating fake data set to start with:
    alphatrue=2. # slope
    betatrue=5.  # intercept
    errs=2.5 # sigma (amplitude of errors)
    
    narr=50 # number of data points
    xvals = np.arange(narr) + 1. # xvals range from 1-51
    yvals = alphatrue*xvals + betatrue + npr.normal(0,errs,narr) # yvals 
    
    # Determine slope & y-intercept using least squares analytic solution 
    alphaest=(np.mean(xvals)*np.mean(yvals)-np.mean(xvals*yvals)) / \
       (np.mean(xvals)**2 -np.mean(xvals**2)) #  from derivation
    betaest= np.mean(yvals) - alphaest * np.mean(xvals) # calculate estimate of y-intercept from derivation
    # The MLE values of the slope and y-intercept are equivalent to the "least
    # squares" fit results.
    
    # Compute analytic uncertainties on slope and y-intercept 
    alphaunc = np.sqrt(np.sum((yvals - (alphaest*xvals+betaest))**2) / ((narr-2.)*(np.sum((xvals-np.mean(xvals))**2))))
    betaunc = np.sqrt((np.sum((yvals - (alphaest*xvals+betaest))**2) / (narr-2.)) * ((1./narr) + (np.mean(xvals)**2)/np.sum((xvals-np.mean(xvals))**2)) )
    
    # Solution using python solver np.polyfit
    # third parameter is order of fit, 1 for linear
# duplicate np.polyfit command below can be commented out now
#    pfit = np.polyfit(xvals, yvals, 1) # returns coeff. of highest order term first
    
    # Can also obtain parameter uncertainties from the diagonal terms of the covariance
    # matrix, which is the inverse of the Hessian matrix and
    # can be computed by np.polyfit when cov=True
    pfit, covp = np.polyfit(xvals, yvals, 1, cov=True)  # returns coeff. of highest power first
    # setting cov=True returns the covariance matrix
    
    # BOOTSTRAP!
    
    npars = 2 # slope and intercept
    nboot = 1000 # usually want at least 1000
    rng = check_random_state(None)
    ind = rng.randint(narr, size=(narr,nboot))
    #bootresults = np.zeros((npars,nboot))
    bootres = jl.Parallel(n_jobs=nproc)(jl.delayed(np.polyfit)(xvals[ind[:, iboot]], yvals[ind[:, iboot]], 1) for iboot in range(nboot))
    sloperesults,intresults = zip(*bootres)
    #for iboot in xrange(nboot):
    #    bootresults[:,iboot] = np.polyfit(xvals[ind[:,iboot]], yvals[ind[:,iboot]], 1)
    #sloperesults = bootresults[0,:]
    #intresults = bootresults[1,:]
    slopesort = np.argsort(sloperesults)
    slopemed = np.median(sloperesults)
    pct16 = int(round(0.16*nboot))
    pct84 = int(round(0.84*nboot))
    slope68pcterrs = [slopemed-sloperesults[slopesort[pct16]],sloperesults[slopesort[pct84]]-slopemed]
    intsort = np.argsort(intresults)
    intmed = np.median(intresults)
    int68pcterrs = [intmed-intresults[intsort[pct16]],intresults[intsort[pct84]]-intmed]
    
    slope_err_ratio = 0.5*(np.sum(slope68pcterrs))/np.sqrt(covp[0,0])
    int_err_ratio = 0.5*(np.sum(int68pcterrs))/np.sqrt(covp[1,1])
    slope_err_ratio2 = 0.5*(np.sum(slope68pcterrs))/alphaunc
    int_err_ratio2 = 0.5*(np.sum(int68pcterrs))/betaunc
    
    return runindex, slope_err_ratio, int_err_ratio, slope_err_ratio2, int_err_ratio2
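# Usage sketch (illustrative): the function is written to be mapped over run
# indices (print/plot calls removed for multiprocessing), each run returning
# the bootstrap-to-analytic error ratios for the slope and intercept.
import joblib as jl

runs = jl.Parallel(n_jobs=3)(jl.delayed(newdatafullbootstrap)(i) for i in range(10))
for runindex, slope_ratio, int_ratio, slope_ratio2, int_ratio2 in runs:
    print(runindex, slope_ratio, int_ratio, slope_ratio2, int_ratio2)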
Example #7
# Can also obtain parameter uncertainties from the diagonal terms of the covariance
# matrix, which is the inverse of the Hessian matrix and
# can be computed by np.polyfit when cov=True

pfit, covp = np.polyfit(xvals, yvals, 1,
                        cov=True)  # returns coeff. of highest power first
# setting cov=True returns the covariance matrix
print("slope is %0.7f +- %0.7f" % (pfit[0], np.sqrt(covp[0, 0])))
print("intercept is %0.7f +- %0.7f" % (pfit[1], np.sqrt(covp[1, 1])))

# BOOTSTRAP!

npars = 2  # slope and intercept
nboot = 10000  # usually want at least 1000
rng = check_random_state(None)
ind = rng.randint(narr, size=(narr, nboot))
bootresults = np.zeros((npars, nboot))
for iboot in range(nboot):
    bootresults[:, iboot] = np.polyfit(xvals[ind[:, iboot]],
                                       yvals[ind[:, iboot]], 1)
sloperesults = bootresults[0, :]
intresults = bootresults[1, :]
slopesort = np.argsort(sloperesults)
slopemed = np.median(sloperesults)
pct16 = int(round(0.16 * nboot))
pct84 = int(round(0.84 * nboot))
slope68pcterrs = [
    slopemed - sloperesults[slopesort[pct16]],
    sloperesults[slopesort[pct84]] - slopemed
]
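# The 68% interval on the intercept follows from the same percentile logic
# applied to intresults (a sketch mirroring the slope calculation above):
intsort = np.argsort(intresults)
intmed = np.median(intresults)
int68pcterrs = [intmed - intresults[intsort[pct16]],
                intresults[intsort[pct84]] - intmed]
print("slope     = %0.4f +%0.4f/-%0.4f" % (slopemed, slope68pcterrs[1], slope68pcterrs[0]))
print("intercept = %0.4f +%0.4f/-%0.4f" % (intmed, int68pcterrs[1], int68pcterrs[0]))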
Example #8
# select the preferred frequentist linear fit between the forward, inverse,
# and bisector fits, plot it, and print out its coefficients (slope and
# intercept expressed assuming yy = slope*xx + intercept)

# use bootstrap resampling of the data to determine the 68% confidence
# intervals for the slope and intercept you computed above
# (hint: you cannot use the pre-made bootstrap/smoothedbootstrap functions
# but must construct a custom bootstrap as Ivezic Fig. 3.24 had to do)

nboot = 100
Ndata = len(xx)
sloperesults = np.zeros(nboot)
intresults = np.zeros(nboot)
random_state = None
rng = check_random_state(random_state)
ind = rng.randint(Ndata, size=(Ndata, nboot))
#for k in range(nboot):

#slopesort=np.argsort(sloperesults)
#intsort=np.argsort(intresults)

#print "68%% confidence interval for slope: %0.2f -- %0.2f" % (,)
#print "68%% confidence interval for intercept: %0.2f -- %0.2f" % (,)

# now suppose we wish to determine whether a 2nd order model is
# superior to a 1st order one for this data set -- this problem is too
# hard for a 3-hour test if we allow for errors in both variables, so
# we'll assume from now on that all the scatter is in the yy direction

# perform 1st and 2nd order polyfit results
def LS_window_white_noise(t, omega, y=0, dy=1,
                           generalized=False, subtract_mean=False,
                           random_state=None, N_mock=100,
                           hist=False, plot_hist=True,Nbins=200):
    """Use a monte carlo simulation to compute Lomb-Scargle white noise 
        significance for the given spectral window
    Parameters
    ----------
    The first set of parameters are passed to the lomb_scargle algorithm
    t : array_like
        sequence of times
    omega : array_like
        frequencies at which to evaluate p(omega)
    generalized : bool
        if True (default) use generalized lomb-scargle method
        otherwise, use classic lomb-scargle.
    subtract_mean : bool
        if True (default) subtract the sample mean from the data before
        computing the periodogram.  Only referenced if generalized is False
    Remaining parameters control the bootstrap
    N_mock : int
        number of simulations
    random_state : None, int, or RandomState object
        random seed, or random number generator
    Returns
    -------
    D : ndarray
        distribution of the height of the highest peak
    if hist=True:
        omegaD : ndarray
            distribution of the angular frecuencies corresponding to D
    """
    random_state = check_random_state(random_state)
    t = np.asarray(t)
    #dy = np.ones_like(y)

    D = np.zeros(N_mock)
    omegaD= np.zeros(N_mock)
    #PS_mock_all=[]
    #PS_mock_max=np.array([])
    #omega_mock_max=np.array([])
    for i in range(N_mock):
        #ind = random_state.randint(0, len(y), len(y))
        # draw a fresh white-noise realization about the constant level y,
        # using the supplied random_state rather than the global np.random
        y_mock = random_state.normal(y, dy, size=len(t))
        p = lomb_scargle(t, y_mock, dy, omega,
                         generalized=generalized, subtract_mean=subtract_mean)
        D[i] = p.max()
        omegaD[i] = omega[p.argmax()]
        
        
    #max_PS_mock=np.max(PS_mock_all,axis=0)
    if hist:
        
        if plot_hist:
            from matplotlib import pyplot as plt
            frecD=omegaD.copy()/(2*np.pi)
            
            plt.figure('white noise peak hist')
            plt.hist(frecD, density=True, bins=Nbins)
            plt.hist(frecD, density=True, histtype='step')

            plt.figure('white noise peak cumhist')
            Xcum=np.sort(D)
            Ycum=np.array(range(N_mock))/float(N_mock)
            plt.plot(Xcum,Ycum)
            #plt.xlim(Xcum,Xcum)
            plt.grid()
            plt.minorticks_on()
            plt.xlabel('')

        return D,omegaD
    else:
        return D
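# Usage sketch (illustrative): estimate a white-noise significance level for a
# hypothetical observing window by taking a high percentile of the peak-height
# distribution D.
import numpy as np

t = np.sort(np.random.uniform(0, 100, 60))        # hypothetical observation times
omega = 2 * np.pi * np.linspace(0.01, 1.0, 1000)  # angular frequencies to test
D = LS_window_white_noise(t, omega, y=0, dy=1, N_mock=200, random_state=0)
sig99 = np.percentile(D, 99)   # peaks above this occur <1% of the time under white noise
print(sig99)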
def two_point_angular(ra, dec, bins, BT_D=None, BT_R=None,
                      method='standard', ra_R=None, dec_R=None,
                      random_state=None, return_trees=False, verbose=False,
                      RR=None, return_RR=False, return_DD=False):
    """
    Angular two-point correlation function

    A separate function is needed because angular distances are not
    euclidean, and random sampling needs to take into account the
    spherical volume element.

    There are a number of options for what gets returned.  The order
    will always be correlation_function, data_balltree, random_balltree,
    random_random, data_data.  If the user asks for a subset of those, the
    list will be shorter but the order will be maintained.

    Parameters
    ----------
    ra : array_like
        input right ascension, shape = (n_samples,)
    dec : array_like
        input declination, shape = (n_samples,)
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1
    BT_D : BallTree (optional)
        ball tree created with the data positions.  The positions given to
        the BallTree should be Euclidean and not angular
    BT_R : BallTree (optional)
        ball tree created with the random positions. The positions given to
        the BallTree should be Euclidean and not angular
    method : string (optional)
        "standard" or "landy-szalay".  Default is 'standard'
    ra_R, dec_R : array_like (optional if no BT_R)
        the random sample to be used.  If you pass BT_R
        as an argument, you must also pass the random sample
        it was made from with this
    random_state : integer, np.random.RandomState, or None (optional)
        specify the random state to use for generating background.
        Default is None
    RR : array-like, shape = Nbins (optional)
        If this exact set of randoms and theta bins has been
        run, you can supply the RR counts and not calculate them again.
        You also need the data if you're running landy-szalay.
    return_RR : boolean (optional)
        If you know you'll be running a CF with this
        exact same random sample and binning (like with a bootstrap),
        you can get the RR counts returned and feed them back in the
        next time.  Default is False.
    return_DD : boolean (optional)
        In case you want to fit to the pair counts rather than the w(theta)
        estimator, you can get this back too.  Default is False
    return_trees : boolean (optional)
        if True, the returns will include the ball trees for the data and
        random sets.  Default is False
        
    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    data_tree : BallTree (optional)
        the ball tree used to calculate distances between objects
        quickly in the data.  only returned if return_trees == True
    random_tree : BallTree (optional)
        the ball tree used to calculate distances between objects
        quickly in the randomly generated set.  only returned if
        return_trees == True
    RR : ndarray (optional)
        the RR counts may be returned (if return_RR==True) and used
        again without recomputing if the theta bins and the random
        sample is exactly the same
    DD : ndarray (optional)
        the DD pair counts, returned if return_DD==True
    """
    ra = np.asarray(ra)
    dec = np.asarray(dec)
    rng = check_random_state(random_state)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if (ra.ndim != 1) or (dec.ndim != 1) or (ra.shape != dec.shape):
        raise ValueError('ra and dec must be 1-dimensional '
                         'arrays of the same length')

    n_features = len(ra)
    Nbins = len(bins) - 1

    # draw a random sample with N points
    if (ra_R is None) | (dec_R is None):
        if verbose:
            print "two_point_angular says: generating random sample"
        ra_R, dec_R = uniform_sphere((min(ra), max(ra)),
                                     (min(dec), max(dec)),
                                     2 * len(ra))

    data = np.asarray(ra_dec_to_xyz(ra, dec), order='F').T
    data_R = np.asarray(ra_dec_to_xyz(ra_R, dec_R), order='F').T

    # convert spherical bins to cartesian bins
    bins_transform = angular_dist_to_euclidean_dist(bins)
    if verbose:
        print "two_point_angular says: transform complete, calling two_point"
    return two_point(data, bins_transform, method=method, BT_D=BT_D,
                     BT_R=BT_R, data_R=data_R, random_state=rng,
                     return_trees=return_trees, RR=RR, return_RR=return_RR,
                     return_DD=return_DD)
def two_point(data, bins, BT_D=None, BT_R=None, method='standard',
              data_R=None, random_state=None, return_trees=False, 
              verbose=False, RR=None, return_RR=False, return_DD=False):
    #Edited by CW to allow user to supply more things and have more things
    #returned.
    """
    Two-point correlation function in Euclidean space.  Options to return
    a number of things.  What gets returned is up to the user but the order
    will always be correlation_function, data_balltree, random_balltree,
    random_random, data_data.  If the user asks for a subset of those, the
    list will be shorter but the order will be maintained.

    Parameters
    ----------
    data : array_like
        Input data, shape = [n_samples, n_features]
    bins : array_like
        Bins within which to compute the 2-point correlation.
        Shape = Nbins + 1
    BT_D : BallTree (optional)
        Ball tree created with the data positions
    BT_R : BallTree (optional)
        Ball tree created with the random positions
    method : string (optional)
        "standard" or "landy-szalay".  Default is 'standard'.
    data_R : array_like (optional if no BT_R)
        If specified, use this as the random comparison sample.  This must
        be included if you wish to use a pre-computed random ball tree
    random_state : integer, np.random.RandomState, or None (optional)
        Specify the random state to use for generating background.  Not
        used if the randoms are provided by the user.  Default is None
    RR : 1D array-like, shape = Nbins
        If this exact set of randoms and theta bins has been
        run, you can supply the RR counts and not calculate them again.
        You also need the data if you're running with method='landy-szalay'
    return_trees : boolean (optional)
        If True, the returns will include the data and random ball trees.
        Default is False.
    return_RR : boolean (optional)
        If you know you'll be running a CF with this
        exact same random sample and binning (like with a bootstrap),
        you can get the RR counts returned and feed them back in the
        next time
    return_DD : boolean (optional)
        In case you want to fit to the pair counts rather
        than the w(theta) estimator, you can get this back too
    verbose: boolean (optional)
        Determines whether or not the function narrates what it's doing.
        Default is False.
 
    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    data_tree : BallTree (optional)
        the ball tree used to calculate distances between objects
        quickly in the data.  only returned if return_trees == True
    random_tree : BallTree (optional)
        the ball tree used to calculate distances between objects
        quickly in the randomly generated set.  only returned if
        return_trees == True
    RR : ndarray (optional)
        the RR counts may be returned (if return_RR==True) and used
        again without recomputing if the theta bins and the random
        sample is exactly the same
    DD : ndarray (optional)
        the DD pair counts, returned if return_DD==True   
    """
    data = np.asarray(data)
    bins = np.asarray(bins)
    rng = check_random_state(random_state)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    n_samples, n_features = data.shape
    Nbins = len(bins) - 1

    # shuffle all but one axis to get background distribution
    if data_R is None:
        print "two_point says: generating random sample"
        data_R = data.copy()
        for i in range(n_features - 1):
            rng.shuffle(data_R[:, i])
    else:
        data_R = np.asarray(data_R)
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    factor = len(data_R) * 1. / len(data)

    if BT_D is None:
        if verbose:
            print "two_point says: computing BallTree for data"
        BT_D = BallTree(data)
    if BT_R is None:
        if verbose:
            print "two_point says: computing BallTree for random sample"
        BT_R = BallTree(data_R)

    counts_DD = np.zeros(Nbins + 1)
    counts_RR = np.zeros(Nbins + 1)

    if verbose:
        print "two_point says: working through the CF calc.  This could take a while"
    for i in range(Nbins + 1):
        counts_DD[i] = np.sum(BT_D.query_radius(data, bins[i],
                                                count_only=True))
        if RR is None:
            counts_RR[i] = np.sum(BT_R.query_radius(data_R, bins[i],
                                                    count_only=True))

    if verbose:
        print "two_point says: binning done!"
    DD = np.diff(counts_DD)
    if RR is None:
        RR = np.diff(counts_RR)

    # check for zero in the denominator
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    if method == 'standard':
        corr = factor**2 * DD / RR - 1
    elif method == 'landy-szalay':
        counts_DR = np.zeros(Nbins + 1)
        for i in range(Nbins + 1):
            counts_DR[i] = np.sum(BT_R.query_radius(data, bins[i],
                                                    count_only=True))
        DR = np.diff(counts_DR)

        corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR

    corr[RR_zero] = np.nan

    to_return = corr
    if return_trees:
        to_return = [to_return]
        to_return.append(BT_D)
        to_return.append(BT_R)
    if return_RR:
        if not return_trees:
            to_return = [to_return]
        to_return.append(RR)
    if return_DD:
        if (not return_trees) and (not return_RR):
            to_return = [to_return]
        to_return.append(DD)
    
    return to_return
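# Usage sketch (illustrative): the return value is either the bare corr array or
# a list built in the fixed order corr, BT_D, BT_R, RR, DD depending on the
# flags, so unpacking must follow the flags that were passed.  Hypothetical data:
import numpy as np

pts = np.random.random((500, 2))                  # hypothetical 2-D positions
bins = np.linspace(0.05, 0.5, 11)
corr, RR = two_point(pts, bins, method='landy-szalay',
                     random_state=0, return_RR=True)
# feed the RR counts back in to avoid recomputing them for the same bins/randoms
corr2 = two_point(pts, bins, method='landy-szalay', random_state=0, RR=RR)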
Example #12
def generate_power_DRW(N,
                       dt,
                       sigma,
                       tau,
                       generate_complex=False,
                       random_state=None):
    """Generate a damped-random-walk (DRW) light curve.

    This uses the Fourier-domain method of Timmer & Koenig [1]_, with the
    power spectrum of a damped random walk in place of a pure power law.

    Parameters
    ----------
    N : integer
        Number of equal-spaced time steps to generate
    dt : float
        Spacing between time-steps
    sigma : float
        amplitude parameter of the damped random walk
    tau : float
        damping (correlation) timescale of the damped random walk
    generate_complex : boolean (optional)
        if True, generate a complex time series rather than a real time series
    random_state : None, int, or np.random.RandomState instance (optional)
        random seed or random number generator

    Returns
    -------
    x : ndarray
        the length-N light curve

    References
    ----------
    .. [1] Timmer, J. & Koenig, M. On Generating Power Law Noise. A&A 300:707
    """
    import matplotlib.pyplot as plt
    from astroML.utils import check_random_state
    import numpy as np

    random_state = check_random_state(random_state)
    dt = float(dt)
    N = int(N)

    Npos = int(N / 2)
    Nneg = int((N - 1) / 2)
    domega = (2 * np.pi / dt / N)
    df = (1 / dt / N)

    if generate_complex:
        omega = domega * np.fft.ifftshift(np.arange(N) - int(N / 2))
    else:
        omega = domega * np.arange(Npos + 1)

    if generate_complex:
        freq = df * np.fft.ifftshift(np.arange(N) - int(N / 2))
    else:
        freq = df * np.arange(Npos + 1)

    # Gaussian random Fourier coefficients, scaled by the DRW power spectrum
    x_fft = np.zeros(len(omega), dtype=complex)
    x_fft.real[1:] = random_state.normal(0, 1, len(omega) - 1)
    x_fft.imag[1:] = random_state.normal(0, 1, len(omega) - 1)

    #    x_fft[1:] *= 2*sigma*np.sqrt(tau/(1+omega[1:]**2*tau**2))
    x_fft[1:] *= np.sqrt(2) * sigma * np.sqrt(tau / (1 + omega[1:]**2 * tau**2))
    x_fft[1:] *= (1. / np.sqrt(2))

    if (not generate_complex) and (N % 2 == 0):
        x_fft.imag[-1] = 0

    if generate_complex:
        x = np.fft.ifft(x_fft)
    else:
        x = np.fft.irfft(x_fft, N)

    return x
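# Usage sketch (illustrative): generate a hypothetical damped-random-walk light
# curve with unit time sampling, amplitude parameter sigma and damping time tau.
import numpy as np

N, dt = 1000, 1.0
lc = generate_power_DRW(N, dt, sigma=0.2, tau=50.0, random_state=0)
t = dt * np.arange(N)                             # corresponding time axis
print(lc.shape, lc.std())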
print("np.polyfit MLE y-intercept = %0.7f" %pfit[1])

# Can also obtain parameter uncertainties from the diagonal terms of the covariance
# matrix, which is the inverse of the Hessian matrix and
# can be computed by np.polyfit when cov=True

pfit, covp = np.polyfit(xvals, yvals, 1, cov=True)  # returns coeff. of highest power first
# setting cov=True returns the covariance matrix
print("slope is %0.7f +- %0.7f" % (pfit[0], np.sqrt(covp[0,0])))
print("intercept is %0.7f +- %0.7f" % (pfit[1], np.sqrt(covp[1,1])))

# BOOTSTRAP!

npars = 2 # slope and intercept
nboot = 10000 # usually want at least 1000
rng = check_random_state(None)
ind = rng.randint(narr, size=(narr,nboot))
bootresults = np.zeros((npars,nboot))
for iboot in range(nboot):
    bootresults[:,iboot] = np.polyfit(xvals[ind[:,iboot]], yvals[ind[:,iboot]], 1)
sloperesults = bootresults[0,:]
intresults = bootresults[1,:]
slopesort = np.argsort(sloperesults)
slopemed = np.median(sloperesults)
pct16 = int(round(0.16*nboot))
pct84 = int(round(0.84*nboot))
slope68pcterrs = [slopemed-sloperesults[slopesort[pct16]],sloperesults[slopesort[pct84]]-slopemed]
intsort = np.argsort(intresults)
intmed = np.median(intresults)
int68pcterrs = [intmed-intresults[intsort[pct16]],intresults[intsort[pct84]]-intmed]
def LS_bootstrap_err_est(t, y, dy, omega,
                           generalized=True, subtract_mean=True,
                           N_bootstraps=100, random_state=None,
                           hist=False, plot_hist=True,Nbins=200):
    """Use a bootstrap analysis to compute Lomb-Scargle error estimation
    Parameters
    ----------
    The first set of parameters are passed to the lomb_scargle algorithm
    t : array_like
        sequence of times
    y : array_like
        sequence of observations
    dy : array_like
        sequence of observational errors
    omega : array_like
        frequencies at which to evaluate p(omega)
    generalized : bool
        if True (default) use generalized lomb-scargle method
        otherwise, use classic lomb-scargle.
    subtract_mean : bool
        if True (default) subtract the sample mean from the data before
        computing the periodogram.  Only referenced if generalized is False
    Remaining parameters control the bootstrap
    N_bootstraps : int
        number of bootstraps
    random_state : None, int, or RandomState object
        random seed, or random number generator
    Returns
    -------
    D : ndarray
        distribution of the height of the highest peak
    """
    random_state = check_random_state(random_state)
    t = np.asarray(t)
    y = np.asarray(y)
    dy = np.asarray(dy) + np.zeros_like(y)

    D = np.zeros(N_bootstraps)
    omegaD= np.zeros(N_bootstraps)
    
    for i in range(N_bootstraps):
        ind = random_state.randint(0, len(y), len(y))
        #print(ind)
        #print(t[ind], y[ind], dy[ind])
        #print(subtract_mean)
        ### The time vector is pathological, since there is an isolated observation on two different nights; although it is not entirely correct, we keep the two points on those specific nights so that the mock periodogram does not give strange results (i.e. period=0)
        ##ind[-2]=11
        ##ind[-1]=-1
        p = lomb_scargle(t[ind], y[ind], dy[ind], omega,
                         generalized=generalized, subtract_mean=subtract_mean)
        D[i] = p.max()
        omegaD[i]=omega[p.argmax()]
        
        #if omegaD[i]==min(omega):
        #    from matplotlib import pyplot as plt
        #    plt.plot(omega,p)
        #    plt.figure()
        #    print(t[ind],y[ind],dy[ind])
            
            
    if hist:
        
        if plot_hist:
            from matplotlib import pyplot as plt
            frecD=omegaD.copy()/(2*np.pi)
            
            plt.figure('bootstrap hist')
            plt.hist(frecD, density=True, bins=Nbins)
            plt.hist(frecD, density=True, histtype='step')

            plt.figure('bootstrap cumhist')
            Xcum=np.sort(D)
            Ycum=np.array(range(N_bootstraps))/float(N_bootstraps)
            plt.plot(Xcum,Ycum)
            #plt.xlim(Xcum,Xcum)
            plt.grid()
            plt.minorticks_on()
            plt.xlabel('')

        return D,omegaD
    else:
        return D
def LS_null_hypotesis(t, y, dy, omega, 
                           generalized=False, subtract_mean=False,
                           random_state=None, N_mock=1000,
                           hist=False, plot_hist=True,Nbins=200):
    """Use a monte carlo simulation to compute Lomb-Scargle null hypotesis periodogram
        shuffling the vector {t U y_obs}
    Parameters
    ----------
    The first set of parameters are passed to the lomb_scargle algorithm
    t : array_like
        sequence of times
    y : array_like
        sequence of observations
    dy : array_like
        sequence of observational errors
    omega : array_like
        frequencies at which to evaluate p(omega)
    generalized : bool
        if True use the generalized Lomb-Scargle method,
        otherwise the classic Lomb-Scargle (default here is False).
    subtract_mean : bool
        if True subtract the sample mean from the data before
        computing the periodogram.  Only referenced if generalized is False
        (default here is False).
    Remaining parameters control the simulation
    N_mock : int
        number of simulations
    random_state : None, int, or RandomState object
        random seed, or random number generator
    Returns
    -------
    D : ndarray
        distribution of the height of the highest peak
    if hist=True:
        omegaD : ndarray
            distribution of the angular frequencies corresponding to D
    """
    #print(len(y))
    #print(len(dy))
    random_state = check_random_state(random_state)
    t = np.asarray(t)
    #dy = np.ones_like(y)
    y = np.asarray(y)
    dy = np.asarray(dy) + np.zeros_like(y)

    D = np.zeros(N_mock)
    omegaD= np.zeros(N_mock)

    mock_vector=np.append(y,t)
    mock_dy=dy.copy()
    
    for i in range(100):
        np.random.shuffle(mock_vector)
        print(mock_vector)
    for i in range(N_mock):
        #ind = random_state.randint(0, len(y), len(y))
        #y = np.random.normal(len(t))
        np.random.shuffle(mock_vector) 
        np.random.shuffle(mock_dy)   
        
        p = lomb_scargle(mock_vector[:len(t)], mock_vector[-len(t):], mock_dy, omega,\
                         generalized=generalized, subtract_mean=subtract_mean)
        D[i] = p.max()
        omegaD[i]=omega[p.argmax()]
        
        
    #max_PS_mock=np.max(PS_mock_all,axis=0)
    if hist:
        
        if plot_hist:
            from matplotlib import pyplot as plt
            frecD=omegaD.copy()/(2*np.pi)
            
            plt.figure('null hypothesis hist')
            plt.hist(frecD, density=True, bins=Nbins)
            plt.hist(frecD, density=True, histtype='step')

            plt.figure('null hypothesis cumhist')
            Xcum=np.sort(D)
            Ycum=np.array(range(N_mock))/float(N_mock)
            plt.plot(Xcum,Ycum)
            #plt.xlim(Xcum,Xcum)
            plt.grid()
            plt.minorticks_on()
            plt.xlabel('')

        return D,omegaD
    else:
        return D
Example #16
# select the preferred frequentist linear fit between the forward, inverse,
# and bisector fits, plot it, and print out its coefficients (slope and
# intercept expressed assuming yy = slope*xx + intercept)



# use bootstrap resampling of the data to determine the 68% confidence
# intervals for the slope and intercept you computed above
# (hint: you cannot use the pre-made bootstrap/smoothedbootstrap functions
# but must construct a custom bootstrap as Ivezic Fig. 3.24 had to do)

nboot=100
Ndata=len(xx)
sloperesults = np.zeros(nboot)
intresults = np.zeros(nboot)
random_state=None
rng = check_random_state(random_state)
ind = rng.randint(Ndata, size=(Ndata,nboot))
#for k in range(nboot):



#slopesort=np.argsort(sloperesults)
#intsort=np.argsort(intresults)

#print "68%% confidence interval for slope: %0.2f -- %0.2f" % (,)
#print "68%% confidence interval for intercept: %0.2f -- %0.2f" % (,)

# now suppose we wish to determine whether a 2nd order model is
# superior to a 1st order one for this data set -- this problem is too
# hard for a 3-hour test if we allow for errors in both variables, so
# we'll assume from now on that all the scatter is in the yy direction
Example #17
def compute_CDF_bounds_by_MC(sample,
                             sample_errp,
                             sample_errm=None,
                             sample_ll=None,
                             sample_ul=None,
                             ll_max_val=None,
                             ul_min_val=None,
                             weights=None,
                             weights_err=None,
                             weqs=False,
                             confidence=95.0,
                             bins=None,
                             positive=False,
                             precision=1000,
                             precision_pdf=1000,
                             N_MC=1000,
                             bootstrap=False,
                             verbose=False,
                             random_state=None,
                             show_plot=False,
                             ax=None,
                             color='k',
                             show_median=True,
                             **kwargs):
    """
        Function to compute the lower and upper bounds of a cumulative distribution function for quantities with errors.
        Note if this function takes too long, try reducing N_MC or precision.
        Returns the middle points of the bins, the median CDF, the lower and upper CDF envelopes, and the figure on which they were plotted.

        Parameters:
        -----------

        sample : [numpy array]
            Numpy array of the data for which to compute the bounds.

        sample_errp : [numpy array]
            Numpy array of the positive error on the data for which to compute the bounds.

        sample_errm : [numpy array]
            Default is None
            Numpy array of the negative error on the data for which to compute the bounds.
            If None, the errors are assumed to be symmetric and equal to sample_errp

        sample_ll : [numpy array]
            Default is None
            Numpy array of the lower limits on the data for which to compute the bounds.
            Expects 0 if not a limit, 1 if a limit.

        sample_ul : [numpy array]
            Default is None
            Numpy array of the upper limits on the data for which to compute the bounds.
            Expects 0 if not a limit, 1 if a limit.

        weights : [numpy array]
            Default is None
            Numpy array of the weights on the data for which to compute the bounds.
            If None, the weights are assumed to be equal and no weighting is computed.

        weights_err : [numpy array or list of 2 numpy arrays]
            Default is None
            Numpy array of the error on the weights.
            If a numpy array is provided, the weights are drawn from a gaussian centred on the weights with sigma equal to weights_err.
            If a list of 2 numpy arrays is provided, the first numpy array is assumed to be the plus error and the second the minus.
            ex : [weights_errp, weights_errm]

        weqs : [bool]
            Default is False.
            If True, the weights are assumed equal to the sample and are not redrawn.
            Use this if you are weighting the sample by itself.

        confidence : [float]
            Default is 95.0
            Confidence level of the bounds (in percent).

        bins : [numpy array]
            Default is None
            Bins in which to compute the cumulative distribution function.
            If None, will use the conservative assumption that the smallest bin value is
            the min value of the sample minus 5 times the maximum error of the sample
            (max value plus 5 times max error for largest bin value).
            Use the 'precision' argument to control the number of bins in this case.

        positive : [bool]
            Default is False
            Set to True if the quantity can not be negative (a distance for example).

        bootstrap : [bool]
            Default is False
            Set to True to perform bootstrap resampling.

        ll_max_val : [numpy array]
            Default is None
            Numpy array of the minimum value to use as a lower bound for the prior on lower limits.
            If lower limits are provided (with sample_ll) but ll_max_val is not specified,
            by default will use the maximum value of the sample plus 5 times the largest error

        ul_min_val : [numpy array]
            Default is None
            Numpy array of the maximum value to use as an upper bound for the prior on upper limits.
            If upper limits are provided (with sample_ul) but ul_min_val is not specified,
            by default will use the minimum value of the sample minus 5 times the largest error

        precision : [int]
            Default is 1000
            Number of bins in which to compute the cumulative distribution function.

        precision_pdf : [int]
            Default is 1000
            Number of points with which to sample the probability distribution function of each data point.

        N_MC : [int]
            Default is 1000
            Number of Monte Carlo realizations of the sample.

        show_plot : [bool]
            Default is False
            If True, will create plots.

        ax : [matplotlib axes instance]
            Default is None
            If specified, will plot the median and upper and lower bounds on the ax.

        color : [str]
            Default is 'k'
            Color of the CDF to plot.

        **kwargs : [dict]
            Any matplotlib key words to pass to the median plot.

        Returns
        -------
        bins_mid : [array]
            Numpy array of values for the middle of the bins used in the cumulative distribution function

        median : [array]
            Numpy array of values for the median at each bin.

        lower : [array]
            Numpy array of values for the lower bound at each bin.

        upper : [array]
            Numpy array of values for the upper bound at each bin.

        fig : [matplotlib Figure]
            The figure on which the plot is drawn. Returns None if show_plot is False.

    """
    if show_plot:
        import matplotlib.pyplot as plt
        from matplotlib.lines import Line2D
    # Check inputs
    if not isinstance(sample, np.ndarray):
        raise TypeError('sample must be numpy array.')

    if verbose:
        log.info("In compute_CDF_bounds_by_MC: initializing...")
    # If a plot is requested but no ax is given, create a figure and also plot the individual points' PDFs
    if show_plot and (ax is None):
        fig = plt.figure(figsize=(8, 5))
        ax1 = fig.add_subplot(211)
        ax2 = fig.add_subplot(212, sharex=ax1)
        fig.subplots_adjust(hspace=0)
        plt.setp(ax1.get_xticklabels(), visible=False)
        ax1.tick_params(axis='x', which='both', bottom=False)
    elif show_plot:
        ax1 = ax
        fig = plt.gcf()
    else:
        fig = None

    sample_len = len(sample)
    # If no negative error, assume errors are symmetric
    if sample_errm is None:
        sample_errm = sample_errp

    # Check if limits
    if sample_ll is None:
        sample_ll = np.zeros(sample_len)
    else:
        if len(sample_ll) != sample_len:
            raise IOError("sample_ll and sample must have same length")
        if ll_max_val is None:
            ll_max_val = (sample.max() +
                          5 * sample_errm.max()) * np.ones(sample_len)

    if sample_ul is None:
        sample_ul = np.zeros(sample_len)
    else:
        if len(sample_ul) != sample_len:
            raise IOError("sample_ul and sample must have same length")
        if ul_min_val is None:
            ul_min_val = (sample.min() -
                          5 * sample_errm.max()) * np.ones(sample_len)

    # Create percentiles:
    lower_percentile = (1. - confidence / 100.) / 2.
    upper_percentile = 1. - lower_percentile

    # Create bins
    if bins is None:
        bin_min = sample.min() - 5 * sample_errm.max()
        bin_max = sample.max() + 5 * sample_errp.max()
        bins = np.linspace(bin_min, bin_max, precision + 1)
    else:
        precision = len(bins) - 1
        bin_min = bins.min()
        bin_max = bins.max()

    # Create middle value of the bins, warning this has a length of bins - 1 (i.e. of length precision here)
    # This is essentially used for plotting purposes
    bins_mid = 0.5 * (bins[1:] + bins[:-1])

    # Create an array where each row is one MC realization (drawing) of the sample
    sample_real = np.zeros((N_MC, sample_len))

    # Create array for the weights
    if weights is not None:
        if (len(weights) != sample_len):
            raise IOError("weights and sample must have same length")
        # Create arrays for the realizations of the weights if they have errors
        if weights_err is not None:
            weights_real = np.zeros(sample_real.shape)
            # If a list of length 2 is provided, interpret it as asymmetric errors
            if isinstance(weights_err, list) and len(weights_err) == 2:
                weights_errp = weights_err[0]
                weights_errm = weights_err[1]
            # Otherwise symmetric errors
            elif isinstance(weights_err, np.ndarray):
                weights_errp = weights_err
                weights_errm = weights_err
        # If no error on the weights
        else:
            weights_real = weights * np.ones(sample_real.shape)
    # If no weights, use weights = 1 for everything
    else:
        weights_real = np.ones(sample_real.shape)

    # Create array that will hold the PDF for each point in the sample
    sample_pdf = np.zeros((precision_pdf, sample_len))

    if verbose:
        log.info(
            "In compute_CDF_bounds_by_MC: starting Monte Carlo drawings...")
    # For every point in the sample:
    # - Generate N_MC realizations of its value following an asymmetric gaussian pdf with standard deviations being the errors on the point
    # - Calculate its PDF
    sample_real = MC_realization(sample,
                                 sample_errp,
                                 sample_errm=sample_errm,
                                 sample_ll=sample_ll,
                                 sample_ul=sample_ul,
                                 ll_max_val=ll_max_val,
                                 ul_min_val=ul_min_val,
                                 N_MC=N_MC,
                                 positive=positive)
    # - If there are weights with errors, generate N_MC realizations of the weight value following same procedure as above
    if weqs:
        weights_real = sample_real.copy()
    elif (weights_err is not None) and (weights is not None):
        for i in range(sample_len):
            weights_real[:, i] = asym_gaussian_draw(weights[i],
                                                    sigma1=weights_errm[i],
                                                    sigma2=weights_errp[i],
                                                    nb_draws=N_MC,
                                                    positive=True)

    # Visualize the PDFs of each individual point in the sample
    if show_plot and (ax is None):
        if verbose:
            log.info(
                "In compute_CDF_bounds_by_MC: starting drawings of individual point PDFs for plotting..."
            )
        for i in range(sample_len):
            if (sample_ll[i] == 0) & (sample_ul[i] == 0):
                x, sample_pdf[:,
                              i] = asym_gaussian_pdf(sample[i],
                                                     x_min=bin_min,
                                                     x_max=bin_max,
                                                     precision=len(sample_pdf),
                                                     sigma1=sample_errm[i],
                                                     sigma2=sample_errp[i],
                                                     nb_draws=N_MC,
                                                     positive=positive)
            # If limits, create flat prior over the desired range
            elif (sample_ll[i] == 1) & (sample_ul[i] == 0):
                x = np.linspace(bin_min, bin_max, len(sample_pdf))
                a = 1. / np.abs(sample[i] - ll_max_val[i])
                sample_pdf[:, i] = np.zeros(len(sample_pdf))
                sample_pdf[:, i][np.where((x >= sample[i])
                                          & (x <= ll_max_val[i]))] = a
            elif (sample_ll[i] == 0) & (sample_ul[i] == 1):
                x = np.linspace(bin_min, bin_max, len(sample_pdf))
                a = 1. / np.abs(ul_min_val[i] - sample[i])
                sample_pdf[:, i] = np.zeros(len(sample_pdf))
                sample_pdf[:, i][np.where((x <= sample[i])
                                          & (x >= ul_min_val[i]))] = a
                if positive:
                    sample_pdf[:, i][np.where(x <= 0)] = 0.0
            else:
                x = np.linspace(bin_min, bin_max, len(sample_pdf))
                a = 1. / np.abs(ul_min_val[i] - ll_max_val[i])
                sample_pdf[:, i] = np.zeros(len(sample_pdf))
                sample_pdf[:, i][np.where((x >= ul_min_val[i])
                                          & (x <= ll_max_val[i]))] = a
            ax2.plot(x, sample_pdf[:, i])

        # Create the total PDF and CDF from the individual PDFs of the points in the sample
        summed_pdf = np.sum(sample_pdf, axis=1)
        summed_pdf = summed_pdf / np.sum(summed_pdf * (x[1] - x[0]))
        ax2.plot(x, summed_pdf, c='k', lw=2, label='Summed PDF')
        ax2.legend()

    # Perform bootstrap
    if bootstrap:
        rng = check_random_state(random_state)
        ind = rng.randint(sample_len, size=(N_MC, sample_len))
        sample_bootstrapped = np.zeros(sample_real.shape)
        weights_bootstrapped = np.zeros(weights_real.shape)
        if verbose:
            log.info("In compute_CDF_bounds_by_MC: starting bootstraps...")
        for i in range(N_MC):
            sample_bootstrapped[i] = sample_real[i][ind[i]]
            weights_bootstrapped[i] = weights_real[i][ind[i]]
    else:
        sample_bootstrapped = sample_real
        weights_bootstrapped = weights_real

    if verbose:
        log.info("In compute_CDF_bounds_by_MC: computing CDF...")

    # Compute the PDF and CDF for each realization
    CDF_real = np.zeros((N_MC, precision))
    PDF_real = np.zeros((N_MC, precision))
    for i in range(N_MC):
        hist, _bins = np.histogram(sample_bootstrapped[i],
                                   bins=bins,
                                   weights=weights_bootstrapped[i])
        PDF_real[i, :] = hist / float(np.sum(hist) * (_bins[1] - _bins[0]))
        CDF_real[i, :] = np.cumsum(hist).astype(float) / float(np.sum(hist))

    # Compute the percentiles for each bin
    lower = np.zeros(precision)
    median = np.zeros(precision)
    upper = np.zeros(precision)
    for i in range(precision):
        q = mstats.mquantiles(CDF_real[:, i],
                              prob=[lower_percentile, 0.5, upper_percentile])
        median[i] = q[1]
        upper[i] = q[2]
        lower[i] = q[0]

    if show_plot:
        if verbose:
            log.info("In compute_CDF_bounds_by_MC: plotting...")
        plot_zorder = 2  # matplotlib's default line zorder; updated below if the median is drawn
        if show_median:
            artist, = ax1.plot(bins_mid,
                               median,
                               drawstyle='steps-mid',
                               c=color,
                               **kwargs)
            # Add the first and last line to make the plot look better
            plot_color = plt.getp(artist, 'color')
            plot_ls = plt.getp(artist, 'linestyle')
            plot_lw = plt.getp(artist, 'linewidth')
            plot_zorder = plt.getp(artist, 'zorder')
            imin = np.where(median > 0)[0][0]
            imax = np.where(median == 1)[0][0]
            xmin = bins_mid[imin]
            xmax = bins_mid[imax]
            bottom_line = Line2D([xmin, xmin], [0, median[imin]],
                                 color=plot_color,
                                 linestyle=plot_ls,
                                 linewidth=plot_lw,
                                 zorder=plot_zorder)
            ax1.add_line(bottom_line)
            top_line = Line2D([xmax, xmax + (xmax - xmin)],
                              [median[imax], median[imax]],
                              color=plot_color,
                              linestyle=plot_ls,
                              linewidth=plot_lw,
                              zorder=plot_zorder)
            ax1.add_line(top_line)
        ax1.fill_between(bins_mid,
                         lower,
                         upper,
                         step='mid',
                         color=color,
                         alpha=0.3,
                         zorder=plot_zorder - 1)
        ax1.plot(bins_mid,
                 lower,
                 drawstyle='steps-mid',
                 lw=0.7,
                 c=color,
                 zorder=plot_zorder)
        ax1.plot(bins_mid,
                 upper,
                 drawstyle='steps-mid',
                 lw=0.7,
                 c=color,
                 zorder=plot_zorder)

    return bins_mid, median, lower, upper, fig
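# Usage sketch (illustrative): confidence envelope on the CDF of a hypothetical
# sample with symmetric errors, adding bootstrap resampling on top of the
# Monte Carlo draws.
import numpy as np

vals = np.random.normal(5.0, 1.0, size=40)        # hypothetical measurements
errs = 0.3 * np.ones_like(vals)                   # symmetric 1-sigma errors
bins_mid, med, lo, hi, fig = compute_CDF_bounds_by_MC(vals, errs,
                                                      confidence=68.0,
                                                      N_MC=500,
                                                      bootstrap=True,
                                                      random_state=0)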
def LS_bootstrap_sig(t, y, dy, omega,
                           generalized=True, subtract_mean=True,
                           N_bootstraps=100, random_state=None,
                           hist=False, plot_hist=True,Nbins=200):
    """Use a bootstrap analysis to compute Lomb-Scargle significance
    Parameters
    ----------
    The first set of parameters are passed to the lomb_scargle algorithm
    t : array_like
        sequence of times
    y : array_like
        sequence of observations
    dy : array_like
        sequence of observational errors
    omega : array_like
        frequencies at which to evaluate p(omega)
    generalized : bool
        if True (default) use generalized lomb-scargle method
        otherwise, use classic lomb-scargle.
    subtract_mean : bool
        if True (default) subtract the sample mean from the data before
        computing the periodogram.  Only referenced if generalized is False
    Remaining parameters control the bootstrap
    N_bootstraps : int
        number of bootstraps
    random_state : None, int, or RandomState object
        random seed, or random number generator
    Returns
    -------
    D : ndarray
        distribution of the height of the highest peak
    """
    random_state = check_random_state(random_state)
    t = np.asarray(t)
    y = np.asarray(y)
    dy = np.asarray(dy) + np.zeros_like(y)

    D = np.zeros(N_bootstraps)
    omegaD= np.zeros(N_bootstraps)
    
    for i in range(N_bootstraps):
        ind = random_state.randint(0, len(y), len(y))
        #print[ind]
        p = lomb_scargle(t, y[ind], dy[ind], omega,
                         generalized=generalized, subtract_mean=subtract_mean)
        D[i] = p.max()
        omegaD[i]=omega[p.argmax()]
    if hist:
        
        if plot_hist:
            from matplotlib import pyplot as plt
            frecD=omegaD.copy()/(2*np.pi)
            
            plt.figure('bootstrap hist')
            plt.hist(frecD, density=True, bins=Nbins)
            plt.hist(frecD, density=True, histtype='step')

            plt.figure('bootstrap cumhist')
            Xcum=np.sort(D)
            Ycum=np.array(range(N_bootstraps))/float(N_bootstraps)
            plt.plot(Xcum,Ycum)
            #plt.xlim(Xcum,Xcum)
            plt.grid()
            plt.minorticks_on()
            plt.xlabel('')

        return D,omegaD
    else:
        return D