Ejemplo n.º 1
0
def period_search_ls(t, mag, magerr, remove_harmonics=True):
    """Find the dominant period of a light curve with a Lomb-Scargle scan.

    The periodogram is evaluated on a regular frequency grid that starts at
    ``2 / T`` (``T`` being the time baseline ``max(t) - min(t)``), uses a
    spacing of ``1 / (3 * T)`` and stops below a maximum frequency of 480.
    The grid point with the highest power defines the reported period.

    Parameters
    ----------
    t : array-like
        Observation times.
    mag : array-like
        Measured values at the times ``t``.
    magerr : array-like
        Uncertainties of ``mag``.
    remove_harmonics : bool, default True
        If true, the period is replaced by the sentinel ``-99`` when it lies
        within 0.005 of the baseline ``T``, or when it lies within 0.005 of
        one of 1/5, 1/4, 1/3, 1/2, 1 or 2 while ``fap_Neff`` is above 0.001.

    Returns
    -------
    dict
        ``"period"``: best period (or ``-99``, see above);
        ``"significance"``: peak power divided by half the 16th-84th
        percentile spread of all powers;
        ``"freqs"`` / ``"powers"``: the frequency grid and its powers;
        ``"power"``: power of the highest peak;
        ``"Nztfobs"``: number of observations, ``len(t)``;
        ``"fap_Neff"``: false-alarm probability from ``FAP_estimated``.
    """
    ls = LombScargleFast(silence_warnings=True)
    ls.fit(t, mag, magerr)

    T = np.max(t) - np.min(t)  # time baseline
    N = len(t)

    # Regular frequency grid, following
    # https://github.com/astroML/gatspy/blob/master/examples/FastLombScargle.ipynb
    oversampling = 3.0
    df = 1. / (oversampling * T)  # frequency grid spacing
    fmin = 2 / T
    # NOTE(review): fmax = 480 means a minimum period of 1/480 in units of
    # ``t``; the previous comment claimed "minimum period is 0.05 d", which
    # would correspond to fmax = 20 -- confirm which one is intended.
    fmax = 480
    # ``//`` on floats yields a float; the number of grid points must be an
    # integer count.
    Nf = int((fmax - fmin) // df)
    freqs = fmin + df * np.arange(Nf)
    periods = 1 / freqs
    powers = ls._score_frequency_grid(fmin, df, Nf)

    ind_best = np.argmax(powers)
    period = periods[ind_best]
    power = powers[ind_best]

    # Calculate the false alarm probability (FAP) of the highest peak.
    # fap_Neff is an underestimate.  The alternative, which is an
    # overestimate, is left disabled:
    #   fap_Baluev = FAP_aliasfree(Z, N, fmax, t, mag, magerr,
    #                              normalization=normalization)
    normalization = 'standard'
    fap_Neff = FAP_estimated(power, N, fmax, t, normalization=normalization)

    # Half of the 16th-84th percentile spread of the powers serves as a
    # scatter estimate for the periodogram.
    psigma = (np.percentile(powers, 84) - np.percentile(powers, 16)) / 2
    significance = power / psigma

    if remove_harmonics:
        # In some cases, the period search is not successful:
        harmonics = np.array([1 / 5, 1 / 4, 1 / 3, 1 / 2, 1., 2.])
        near_baseline = abs(period - T) < 0.005
        near_harmonic = np.any(np.abs(period - harmonics) < 0.005)
        if near_baseline or (near_harmonic and fap_Neff > 0.001):
            period = -99

    return {
        "period": period,
        "significance": significance,
        "freqs": freqs,
        "powers": powers,
        "power": power,
        "Nztfobs": N,
        "fap_Neff": fap_Neff,
    }
Ejemplo n.º 2
0
def find_cycle(feature,
               strain,
               mouse=None,
               bin_width=15,
               methods='LombScargleFast',
               disturb_t=False,
               gen_doc=False,
               plot=True,
               search_range_fit=None,
               nyquist_factor=3,
               n_cycle=10,
               search_range_find=(2, 26),
               sig=np.array([0.05])):
    """
    Use the Lomb-Scargle method on the data of a strain (or of one mouse of
    a strain) to find the periods with the highest Lomb-Scargle power, and
    compute a p-value for each of them. The function can be used on a
    specific strain and mouse, as well as on a whole strain without
    specifying a mouse number. By default the O(NlogN) fast implementation
    of Lomb-Scargle from the gatspy package is used; the result can also be
    visualized.

    Note that the periodogram used for plotting and the search for the best
    cycles do not use the same periods: the former can use the
    user-specified search_range_fit, while the latter runs the model's
    optimizer restricted to search_range_find.

    Parameters
    ----------
    feature: string in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}
        "AS": Active state probability
        "F": Food consumed (g)
        "M_AS": Movement outside homebase
        "M_IS": Movement inside homebase
        "W": Water consumed (g)
        "Distance": Distance traveled
    strain: int
        nonnegative integer indicating the strain number
    mouse: int, default is None
        nonnegative integer indicating the mouse number. If None, the data
        of all mice of the strain are pooled.
    bin_width: int, minute unit, default is 15 minutes
        number of minutes, the time interval for data aggregation
    methods: string in {"LombScargleFast", "LombScargle"}
        indicating the method used in determining periods and best cycle.
        If 'LombScargle' is chosen, 'disturb_t' must be True.
    disturb_t: boolean, default is False
        If true, add uniformly distributed noise to the time sequence which
        is used to fit the Lomb-Scargle model. This is to avoid the singular
        matrix error that could happen sometimes.
    plot: boolean, default is True
        If true, call the visualization function to plot the Lomb-Scargle
        power versus periods. The model is first fitted on the data (either
        strain specific or strain-mouse specific), then the power is
        evaluated at the periods given by search_range_fit, and finally the
        plot is drawn. There will also be stars and horizontal lines
        indicating the p-value of significance. Three stars mean a p-value
        in [0,0.001], two stars a p-value in [0.001,0.01], one star a
        p-value in [0.01,0.05]. The horizontal line is the LS power that has
        a p-value of 0.05.
    search_range_fit: list, numpy array or numpy arange, hours unit,
        default is None
        periods at which the Lomb-Scargle power is evaluated. If plot is
        true, these are drawn on the x-axis. Note that the number of
        search_range_fit points can not be too small, or the smooth
        prediction line will not be accurate. However the plot will always
        give the right periods and their LS power with 1, 2 or 3 stars.
        This can be used to check whether search_range_fit is dense enough
        to draw the correct plot.
        We recommend the default None, which is easy to use.
    nyquist_factor: int
        Only used when search_range_fit is None, in which case the periods
        are chosen automatically.
        5 * nyquist_factor * length(time sequence) / 2 gives the number of
        power and periods used to make LS prediction and plot the graph.
    n_cycle: int, default is 10
        number of periods to be returned by the function, namely those with
        the highest Lomb-Scargle power.
    search_range_find: list, tuple or numpy array with length of 2, default is
                       (2,26), hours unit
        Range of periods to be searched for the best cycle. Note that the
        minimum should be strictly larger than 0 to avoid 1/0 issues.
    sig: list or numpy array, default is [0.05].
        significance level to be used for the horizontal line of the plot.
    gen_doc: boolean, default is False
        If true, additionally return the parameters needed to visualize the
        LS power versus periods.

    Returns
    -------
    By default the tuple (cycle, cycle_power, cycle_pvalue); when gen_doc is
    true the tuple (periods, power, sig, N, cycle, cycle_power,
    cycle_pvalue).

    cycle: numpy array of length 'n_cycle'
         The best periods, i.e. those with the highest LS power.
    cycle_power: numpy array of length 'n_cycle'
         The corresponding LS power of 'cycle'.
    cycle_pvalue: numpy array of length 'n_cycle'
         The corresponding p-value of 'cycle'.
    periods: numpy array of the same length as 'power'
        periods at which the LS power was evaluated. Only returned when
        gen_doc is true.
    power: numpy array of the same length as 'periods'
        the corresponding power of 'periods'. Only returned when
        gen_doc is true.
    sig: list, tuple or numpy array, default is [0.05].
        significance level to be used for the horizontal line of the plot.
        Only returned when gen_doc is true.
    N: int
        the length of the time sequence in the fitted model. Only returned
        when gen_doc is true.

    Raises
    ------
    ValueError
        If 'feature' is not in ALL_FEATURES or 'methods' is not in METHOD.

    Examples
    --------
    >>> a,b,c = find_cycle(feature='F', strain = 0,mouse = 0, plot=False,)
    >>> print(a,b,c)
    [ 23.98055016   4.81080233  12.00693952   6.01216335   8.0356203
         3.4316698    2.56303353   4.9294791   21.37925713   3.5697756 ]
        [ 0.11543449  0.05138839  0.03853218  0.02982237  0.02275952
        0.0147941  0.01151601  0.00998443  0.00845883  0.0082382 ]
        [  0.00000000e+00   3.29976046e-10   5.39367189e-07   8.10528027e-05
          4.71001953e-03   3.70178834e-01   9.52707020e-01   9.99372657e-01
         9.99999981e-01   9.99999998e-01]

    """
    if feature not in ALL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}')
    if methods not in METHOD:
        raise ValueError(
            'Input value must in {"LombScargleFast","LombScargle"}')

    step = bin_width / 60  # bin width in hours

    # get data
    if mouse is None:
        # Pool every mouse of the strain; each mouse gets its own time axis
        # starting at 0.
        data_all = aggregate_data(feature=feature, bin_width=bin_width)
        in_strain = data_all['strain'] == strain
        n_mouse_in_strain = len(set(data_all.loc[in_strain]['mouse']))
        data = []
        t = []
        # NOTE(review): the mice are selected by the indices
        # 0 .. n_mouse_in_strain - 1, i.e. mouse numbers are assumed to be
        # contiguous and to start at 0 -- confirm against aggregate_data.
        for i in range(n_mouse_in_strain):
            mouse_data = data_all.loc[in_strain
                                      & (data_all['mouse'] == i)][feature]
            data.extend(mouse_data)
            # An integer arange scaled by the step always has exactly
            # len(mouse_data) points, unlike an arange with a float step.
            t.extend(np.arange(len(mouse_data)) * step)
        N = len(data)
    else:
        if feature == 'Distance':
            data = aggregate_movement(strain=strain,
                                      mouse=mouse,
                                      bin_width=bin_width)
        else:
            data = aggregate_interval(strain=strain,
                                      mouse=mouse,
                                      feature=feature,
                                      bin_width=bin_width)
        N = len(data)
        t = np.arange(N) * step

    # fit model
    if disturb_t:
        # Jitter every time stamp by at most a tenth of the bin width.
        t = t + np.random.uniform(-bin_width / 600, bin_width / 600, N)

    if methods == 'LombScargleFast':
        model = LombScargleFast(fit_period=False).fit(t=t, y=data)
    elif methods == 'LombScargle':
        model = LombScargle(fit_period=False).fit(t=t, y=data)

    # calculate periods' LS power
    if search_range_fit is None:
        periods, power = model.periodogram_auto(nyquist_factor=nyquist_factor)
    else:
        periods = search_range_fit
        power = model.periodogram(periods=search_range_fit)

    # find best cycle
    model.optimizer.period_range = search_range_find
    cycle, cycle_power = model.find_best_periods(return_scores=True,
                                                 n_periods=n_cycle)
    # The larger the power, the smaller the p-value.
    cycle_pvalue = 1 - (1 - np.exp(cycle_power / (-2) * (N - 1)))**(2 * N)

    # visualization
    if plot:
        lombscargle_visualize(periods=periods,
                              power=power,
                              sig=sig,
                              N=N,
                              cycle_power=cycle_power,
                              cycle_pvalue=cycle_pvalue,
                              cycle=cycle)

    if gen_doc:
        return periods, power, sig, N, cycle, cycle_power, cycle_pvalue

    return cycle, cycle_power, cycle_pvalue
		period = periods[np.argmax(periodogram)]
		extra = ''

	################################################################################

	################################################################################
	# Conditional entropy (adaptive grid)
	################################################################################

	elif options.algorithm == 'ce-adaptive':

		from ce import ConditionalEntropy

		starttime = datetime.now()
		model = ConditionalEntropy(t, m, verbose = True)
		periods, periodogram = model.periodogram()
		period = model.best_periods[0]
		endtime = datetime.now()

		print 'Periods: ' + str(model.best_periods)
		print 'Scores: ' + str(model.best_scores)

		extra = ''

	################################################################################

	################################################################################
	# Conditional entropy
	################################################################################

	elif options.algorithm == 'ce':
Ejemplo n.º 4
0
        period = periods[np.argmax(periodogram)]
        extra = ''

    ################################################################################

    ################################################################################
    # Conditional entropy (adaptive grid)
    ################################################################################

    elif options.algorithm == 'ce-adaptive':

        from ce import ConditionalEntropy

        starttime = datetime.now()
        model = ConditionalEntropy(t, m, verbose=True)
        periods, periodogram = model.periodogram()
        period = model.best_periods[0]
        endtime = datetime.now()

        print 'Periods: ' + str(model.best_periods)
        print 'Scores: ' + str(model.best_scores)

        extra = ''

    ################################################################################

    ################################################################################
    # Conditional entropy
    ################################################################################

    elif options.algorithm == 'ce':
Ejemplo n.º 5
0
def find_cycle(feature, strain, mouse=None, bin_width=15,
               methods='LombScargleFast', disturb_t=False, gen_doc=False,
               plot=True, search_range_fit=None, nyquist_factor=3,
               n_cycle=10, search_range_find=(2, 26), sig=np.array([0.05])):
    """
    Use the Lomb-Scargle method on the data of a strain (or of one mouse of
    a strain) to find the periods with the highest Lomb-Scargle power, and
    compute a p-value for each of them. The function can be used on a
    specific strain and mouse, as well as on a whole strain without
    specifying a mouse number. By default the O(NlogN) fast implementation
    of Lomb-Scargle from the gatspy package is used; the result can also be
    visualized.

    Note that the periodogram used for plotting and the search for the best
    cycles do not use the same periods: the former can use the
    user-specified search_range_fit, while the latter runs the model's
    optimizer restricted to search_range_find.

    Parameters
    ----------
    feature: string in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}
        "AS": Active state probability
        "F": Food consumed (g)
        "M_AS": Movement outside homebase
        "M_IS": Movement inside homebase
        "W": Water consumed (g)
        "Distance": Distance traveled
    strain: int
        nonnegative integer indicating the strain number
    mouse: int, default is None
        nonnegative integer indicating the mouse number. If None, the data
        of all mice of the strain are pooled.
    bin_width: int, minute unit, default is 15 minutes
        number of minutes, the time interval for data aggregation
    methods: string in {"LombScargleFast", "LombScargle"}
        indicating the method used in determining periods and best cycle.
        If 'LombScargle' is chosen, 'disturb_t' must be True.
    disturb_t: boolean, default is False
        If true, add uniformly distributed noise to the time sequence which
        is used to fit the Lomb-Scargle model. This is to avoid the singular
        matrix error that could happen sometimes.
    plot: boolean, default is True
        If true, call the visualization function to plot the Lomb-Scargle
        power versus periods. The model is first fitted on the data (either
        strain specific or strain-mouse specific), then the power is
        evaluated at the periods given by search_range_fit, and finally the
        plot is drawn. There will also be stars and horizontal lines
        indicating the p-value of significance. Three stars mean a p-value
        in [0,0.001], two stars a p-value in [0.001,0.01], one star a
        p-value in [0.01,0.05]. The horizontal line is the LS power that has
        a p-value of 0.05.
    search_range_fit: list, numpy array or numpy arange, hours unit,
        default is None
        periods at which the Lomb-Scargle power is evaluated. If plot is
        true, these are drawn on the x-axis. Note that the number of
        search_range_fit points can not be too small, or the smooth
        prediction line will not be accurate. However the plot will always
        give the right periods and their LS power with 1, 2 or 3 stars.
        This can be used to check whether search_range_fit is dense enough
        to draw the correct plot.
        We recommend the default None, which is easy to use.
    nyquist_factor: int
        Only used when search_range_fit is None, in which case the periods
        are chosen automatically.
        5 * nyquist_factor * length(time sequence) / 2 gives the number of
        power and periods used to make LS prediction and plot the graph.
    n_cycle: int, default is 10
        number of periods to be returned by the function, namely those with
        the highest Lomb-Scargle power.
    search_range_find: list, tuple or numpy array with length of 2, default is
                       (2,26), hours unit
        Range of periods to be searched for the best cycle. Note that the
        minimum should be strictly larger than 0 to avoid 1/0 issues.
    sig: list or numpy array, default is [0.05].
        significance level to be used for the horizontal line of the plot.
    gen_doc: boolean, default is False
        If true, additionally return the parameters needed to visualize the
        LS power versus periods.

    Returns
    -------
    By default the tuple (cycle, cycle_power, cycle_pvalue); when gen_doc is
    true the tuple (periods, power, sig, N, cycle, cycle_power,
    cycle_pvalue).

    cycle: numpy array of length 'n_cycle'
         The best periods, i.e. those with the highest LS power.
    cycle_power: numpy array of length 'n_cycle'
         The corresponding LS power of 'cycle'.
    cycle_pvalue: numpy array of length 'n_cycle'
         The corresponding p-value of 'cycle'.
    periods: numpy array of the same length as 'power'
        periods at which the LS power was evaluated. Only returned when
        gen_doc is true.
    power: numpy array of the same length as 'periods'
        the corresponding power of 'periods'. Only returned when
        gen_doc is true.
    sig: list, tuple or numpy array, default is [0.05].
        significance level to be used for the horizontal line of the plot.
        Only returned when gen_doc is true.
    N: int
        the length of the time sequence in the fitted model. Only returned
        when gen_doc is true.

    Raises
    ------
    ValueError
        If 'feature' is not in ALL_FEATURES or 'methods' is not in METHOD.

    Examples
    --------
    >>> a,b,c = find_cycle(feature='F', strain = 0,mouse = 0, plot=False,)
    >>> print(a,b,c)
    [ 23.98055016   4.81080233  12.00693952   6.01216335   8.0356203
         3.4316698    2.56303353   4.9294791   21.37925713   3.5697756 ]
        [ 0.11543449  0.05138839  0.03853218  0.02982237  0.02275952
        0.0147941  0.01151601  0.00998443  0.00845883  0.0082382 ]
        [  0.00000000e+00   3.29976046e-10   5.39367189e-07   8.10528027e-05
          4.71001953e-03   3.70178834e-01   9.52707020e-01   9.99372657e-01
         9.99999981e-01   9.99999998e-01]

    """
    if feature not in ALL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}')
    if methods not in METHOD:
        raise ValueError(
            'Input value must in {"LombScargleFast","LombScargle"}')

    step = bin_width / 60  # bin width in hours

    # get data
    if mouse is None:
        # Pool every mouse of the strain; each mouse gets its own time axis
        # starting at 0.
        data_all = aggregate_data(feature=feature, bin_width=bin_width)
        in_strain = data_all['strain'] == strain
        n_mouse_in_strain = len(set(data_all.loc[in_strain]['mouse']))
        data = []
        t = []
        # NOTE(review): the mice are selected by the indices
        # 0 .. n_mouse_in_strain - 1, i.e. mouse numbers are assumed to be
        # contiguous and to start at 0 -- confirm against aggregate_data.
        for i in range(n_mouse_in_strain):
            mouse_data = data_all.loc[
                in_strain & (data_all['mouse'] == i)][feature]
            data.extend(mouse_data)
            # An integer arange scaled by the step always has exactly
            # len(mouse_data) points, unlike an arange with a float step.
            t.extend(np.arange(len(mouse_data)) * step)
        N = len(data)
    else:
        if feature == 'Distance':
            data = aggregate_movement(
                strain=strain, mouse=mouse, bin_width=bin_width)
        else:
            data = aggregate_interval(
                strain=strain, mouse=mouse,
                feature=feature, bin_width=bin_width)
        N = len(data)
        t = np.arange(N) * step

    # fit model
    if disturb_t:
        # Jitter every time stamp by at most a tenth of the bin width.
        t = t + np.random.uniform(-bin_width / 600, bin_width / 600, N)

    if methods == 'LombScargleFast':
        model = LombScargleFast(fit_period=False).fit(t=t, y=data)
    elif methods == 'LombScargle':
        model = LombScargle(fit_period=False).fit(t=t, y=data)

    # calculate periods' LS power
    if search_range_fit is None:
        periods, power = model.periodogram_auto(nyquist_factor=nyquist_factor)
    else:
        periods = search_range_fit
        power = model.periodogram(periods=search_range_fit)

    # find best cycle
    model.optimizer.period_range = search_range_find
    cycle, cycle_power = model.find_best_periods(
        return_scores=True, n_periods=n_cycle)
    # The larger the power, the smaller the p-value.
    cycle_pvalue = 1 - (1 - np.exp(cycle_power / (-2) * (N - 1))) ** (2 * N)

    # visualization
    if plot:
        lombscargle_visualize(periods=periods, power=power, sig=sig, N=N,
                              cycle_power=cycle_power,
                              cycle_pvalue=cycle_pvalue, cycle=cycle)

    if gen_doc:
        return periods, power, sig, N, cycle, cycle_power, cycle_pvalue

    return cycle, cycle_power, cycle_pvalue