def period_search_ls(t, mag, magerr, remove_harmonics=True):
    """Find the strongest Lomb-Scargle period on a fixed frequency grid.

    A ``LombScargleFast`` model is fitted to the light curve and its power
    is evaluated on a regular frequency grid following
    https://github.com/astroML/gatspy/blob/master/examples/FastLombScargle.ipynb
    The period with the highest power is reported together with a false
    alarm probability and a robust significance estimate.

    Parameters
    ----------
    t : array-like
        Observation times.
    mag : array-like
        Measurements at ``t``.
    magerr : array-like
        Uncertainties of ``mag``.
    remove_harmonics : bool, default True
        If truthy, replace the period by ``-99`` when it coincides (within
        0.005) with the time baseline, or when it coincides with one of
        the values 1/5, 1/4, 1/3, 1/2, 1 or 2 while ``fap_Neff`` is above
        0.001.

    Returns
    -------
    dict
        ``period`` (best period or ``-99``), ``significance`` (peak power
        divided by half the 16th-84th percentile spread of all powers),
        ``freqs`` and ``powers`` (the full grid), ``power`` (peak power),
        ``Nztfobs`` (number of observations) and ``fap_Neff``.

    Raises
    ------
    ValueError
        If the time baseline is not positive, or is so short that the
        frequency grid would be empty.
    """
    T = np.max(t) - np.min(t)
    if not T > 0:
        # Also catches NaN; everything below divides by T.
        raise ValueError("time baseline must be positive, got %r" % (T,))

    ls = LombScargleFast(silence_warnings=True)
    ls.fit(t, mag, magerr)
    # NOTE: the original also ran ``ls.best_period`` / ``ls.periodogram``
    # here, but both results were overwritten by the grid search below
    # before being used, so that (expensive) optimizer run was dropped.

    oversampling = 3.0
    N = len(t)
    df = 1. / (oversampling * T)  # frequency grid spacing
    fmin = 2 / T
    # NOTE(review): the original comment claimed "minimum period is 0.05 d",
    # but fmax = 480 corresponds to a minimum period of 1/480 in the units
    # of ``t``.  The value is kept as is -- confirm which one is intended.
    fmax = 480
    Nf = int((fmax - fmin) // df)  # grid size must be an integer
    if Nf < 1:
        raise ValueError(
            "time baseline %r is too short for the frequency grid" % (T,))

    freqs = fmin + df * np.arange(Nf)
    periods = 1 / freqs
    powers = ls._score_frequency_grid(fmin, df, Nf)

    ind_best = int(np.argmax(powers))
    period = periods[ind_best]
    power = powers[ind_best]

    # Calculate the false alarm probability (FAP).  Per the original
    # author's notes fap_Neff is an underestimate; an alternative
    # estimate via FAP_aliasfree (an overestimate) was disabled.
    Z = power
    normalization = 'standard'
    fap_Neff = FAP_estimated(Z, N, fmax, t, normalization=normalization)

    psigma = (np.percentile(powers, 84) - np.percentile(powers, 16)) / 2
    significance = power / psigma

    if remove_harmonics:
        # In some cases the period search is not successful and locks on
        # the baseline or on one of these fixed values.
        harmonics = np.array([1. / 5, 1. / 4, 1. / 3, 1. / 2, 1., 2.])
        near_baseline = abs(period - T) < 0.005
        near_harmonic = bool(np.any(np.abs(period - harmonics) < 0.005))
        if near_baseline or (near_harmonic and fap_Neff > 0.001):
            period = -99

    data_out = {}
    data_out["period"] = period
    data_out["significance"] = significance
    data_out["freqs"] = freqs
    data_out["powers"] = powers
    data_out["power"] = power
    data_out["Nztfobs"] = N
    data_out["fap_Neff"] = fap_Neff
    return data_out
def find_cycle(feature, strain, mouse=None, bin_width=15,
               methods='LombScargleFast', disturb_t=False, gen_doc=False,
               plot=True, search_range_fit=None, nyquist_factor=3,
               n_cycle=10, search_range_find=(2, 26), sig=np.array([0.05])):
    """
    Use Lomb-Scargle method on different strain and mouse's data to find the
    best possible periods with highest p-values. The function can be used on
    specific strains and specific mouses, as well as just specific strains
    without specifying mouse number. We use the O(NlogN) fast implementation
    of Lomb-Scargle from the gatspy package, and also provide a way to
    visualize the result.

    Note that either plotting or calculating L-S power doesn't use the same
    method in finding best cycle. The former can use user-specified
    search_range, while the latter uses default two grid search_range.

    Parameters
    ----------
    feature: string in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}
        "AS": Active state probability
        "F": Food consumed (g)
        "M_AS": Movement outside homebase
        "M_IS": Movement inside homebase
        "W": Water consumed (g)
        "Distance": Distance traveled
    strain: int
        nonnegative integer indicating the strain number
    mouse: int, default is None
        nonnegative integer indicating the mouse number. If None, the data
        of every mouse found for the strain are stacked, each with its own
        time sequence starting at 0.
    bin_width: int, minute unit, default is 15 minutes
        number of minutes, the time interval for data aggregation
    methods: string in {"LombScargleFast", "LombScargle"}
        indicating the method used in determining periods and best cycle.
        If choose 'LombScargle', 'disturb_t' must be True.
    disturb_t: boolean, default is False
        If True, add uniformly distributed noise to the time sequence which
        are used to fit the Lomb Scargle model. This is to avoid the singular
        matrix error that could happen sometimes.
    plot: boolean, default is True
        If True, call the visualization function to plot the Lomb Scargle
        power versus periods plot. First use the data (either strain specific
        or strain-mouse specific) to fit the LS model, then use the
        search_range_fit as time sequence to predict the corresponding LS
        power, at last draw the plot out. There will also be stars and
        horizontal lines indicating the p-value of significance. Three stars
        will be p-value in [0,0.001], two stars will be p-value in
        [0.001,0.01], one star will be p-value in [0.01,0.05]. The horizontal
        line is the LS power that has p-value of 0.05.
    search_range_fit: list, numpy array or numpy arange, hours unit,
        default is None
        list of numbers as the time sequence to predict the corresponding
        Lomb Scargle power. If plot is 'True', these will be drawn as the
        x-axis. Note that the number of search_range_fit points can not be
        too small, or the prediction smooth line will not be accurate.
        However the plot will always give the right periods and their LS
        power with 1,2 or 3 stars. This could be a sign to check whether
        search_range_fit is not enough to draw the correct plot.
        We recommend the default None, which is easy to use.
    nyquist_factor: int
        If search_range_fit is None, the algorithm will automatically choose
        the periods sequence.
        5 * nyquist_factor * length(time sequence) / 2 gives the number of
        power and periods used to make LS prediction and plot the graph.
    n_cycle: int, default is 10
        numbers of periods to be returned by function, which have the highest
        Lomb Scargle power and p-value.
    search_range_find: list, tuple or numpy array with length of 2, default
        is (2,26), hours unit
        Range of periods to be searched for best cycle. Note that the minimum
        should be strictly larger than 0 to avoid 1/0 issues.
    sig: list or numpy array, default is [0.05].
        significance level to be used for plot horizontal line.
    gen_doc: boolean, default is False
        If true, return the parameters needed for visualize the LS power
        versus periods

    Returns
    -------
    cycle: numpy array of length 'n_cycle'
        The best periods with highest LS power and p-values.
    cycle_power: numpy array of length 'n_cycle'
        The corresponding LS power of 'cycle'.
    cycle_pvalue: numpy array of length 'n_cycle'
        The corresponding p-value of 'cycle'.
    periods: numpy array of the same length with 'power'
        use as time sequence in LS model to make predictions. Only return
        when gen_doc is True.
    power: numpy array of the same length with 'periods'
        the corresponding predicted power of periods. Only return when
        gen_doc is True.
    sig: list, tuple or numpy array, default is [0.05].
        significance level to be used for plot horizontal line.
        Only return when gen_doc is True.
    N: int
        the length of time sequence in the fit model. Only return when
        gen_doc is True.

    When gen_doc is true the return order is
    (periods, power, sig, N, cycle, cycle_power, cycle_pvalue).

    Raises
    ------
    ValueError
        If 'feature' is not in ALL_FEATURES or 'methods' is not in METHOD.

    Examples
    -------
    >>> a,b,c = find_cycle(feature='F', strain = 0,mouse = 0, plot=False,)
    >>> print(a,b,c)
    [ 23.98055016   4.81080233  12.00693952   6.01216335   8.0356203
       3.4316698    2.56303353   4.9294791   21.37925713   3.5697756 ]
    [ 0.11543449  0.05138839  0.03853218  0.02982237  0.02275952  0.0147941
      0.01151601  0.00998443  0.00845883  0.0082382 ]
    [  0.00000000e+00   3.29976046e-10   5.39367189e-07   8.10528027e-05
       4.71001953e-03   3.70178834e-01   9.52707020e-01   9.99372657e-01
       9.99999981e-01   9.99999998e-01]
    """
    if feature not in ALL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}')
    if methods not in METHOD:
        raise ValueError(
            'Input value must in {"LombScargleFast","LombScargle"}')

    # Width of one bin in hours.  The float literal keeps this a true
    # division regardless of the interpreter's integer-division rules.
    hours_per_bin = bin_width / 60.0

    # get data
    if mouse is None:
        data_all = aggregate_data(feature=feature, bin_width=bin_width)
        strain_rows = data_all.loc[data_all['strain'] == strain]
        data = []
        t = []
        # Iterate over the mouse ids actually present instead of assuming
        # they are exactly 0..n-1, so no mouse is silently dropped.
        for mouse_id in sorted(set(strain_rows['mouse'])):
            values = strain_rows.loc[
                strain_rows['mouse'] == mouse_id][feature]
            data.extend(values)
            # Every mouse gets its own time axis starting at 0.
            t.extend(np.arange(len(values)) * hours_per_bin)
        N = len(data)
        t = np.asarray(t, dtype=float)
    else:
        if feature == 'Distance':
            data = aggregate_movement(strain=strain, mouse=mouse,
                                      bin_width=bin_width)
        else:
            data = aggregate_interval(strain=strain, mouse=mouse,
                                      feature=feature, bin_width=bin_width)
        N = len(data)
        # Integer arange scaled by the step always has exactly N samples;
        # a float-step arange can overshoot by one element due to rounding.
        t = np.arange(N) * hours_per_bin
    y = data

    # fit model
    if disturb_t:
        # Jitter of at most a tenth of a bin (in hours) on each time stamp.
        t = t + np.random.uniform(-bin_width / 600.0, bin_width / 600.0, N)
    if methods == 'LombScargleFast':
        model = LombScargleFast(fit_period=False).fit(t=t, y=y)
    elif methods == 'LombScargle':
        model = LombScargle(fit_period=False).fit(t=t, y=y)

    # calculate periods' LS power
    if search_range_fit is None:
        periods, power = model.periodogram_auto(nyquist_factor=nyquist_factor)
    else:
        periods = search_range_fit
        power = model.periodogram(periods=search_range_fit)

    # find best cycle
    model.optimizer.period_range = search_range_find
    cycle, cycle_power = model.find_best_periods(return_scores=True,
                                                 n_periods=n_cycle)
    # p-value of each peak: 1 - (1 - exp(-power * (N - 1) / 2)) ** (2 * N)
    cycle_pvalue = 1 - (1 - np.exp(cycle_power / (-2) * (N - 1)))**(2 * N)

    # visualization
    if plot:
        lombscargle_visualize(periods=periods, power=power, sig=sig, N=N,
                              cycle_power=cycle_power,
                              cycle_pvalue=cycle_pvalue, cycle=cycle)

    if gen_doc:
        return periods, power, sig, N, cycle, cycle_power, cycle_pvalue
    return cycle, cycle_power, cycle_pvalue
period = periods[np.argmax(periodogram)] extra = '' ################################################################################ ################################################################################ # Conditional entropy (adaptive grid) ################################################################################ elif options.algorithm == 'ce-adaptive': from ce import ConditionalEntropy starttime = datetime.now() model = ConditionalEntropy(t, m, verbose = True) periods, periodogram = model.periodogram() period = model.best_periods[0] endtime = datetime.now() print 'Periods: ' + str(model.best_periods) print 'Scores: ' + str(model.best_scores) extra = '' ################################################################################ ################################################################################ # Conditional entropy ################################################################################ elif options.algorithm == 'ce':
period = periods[np.argmax(periodogram)] extra = '' ################################################################################ ################################################################################ # Conditional entropy (adaptive grid) ################################################################################ elif options.algorithm == 'ce-adaptive': from ce import ConditionalEntropy starttime = datetime.now() model = ConditionalEntropy(t, m, verbose=True) periods, periodogram = model.periodogram() period = model.best_periods[0] endtime = datetime.now() print 'Periods: ' + str(model.best_periods) print 'Scores: ' + str(model.best_scores) extra = '' ################################################################################ ################################################################################ # Conditional entropy ################################################################################ elif options.algorithm == 'ce':
def find_cycle(feature, strain, mouse=None, bin_width=15,
               methods='LombScargleFast', disturb_t=False, gen_doc=False,
               plot=True, search_range_fit=None, nyquist_factor=3,
               n_cycle=10, search_range_find=(2, 26), sig=np.array([0.05])):
    """
    Use Lomb-Scargle method on different strain and mouse's data to find the
    best possible periods with highest p-values. The function can be used on
    specific strains and specific mouses, as well as just specific strains
    without specifying mouse number. We use the O(NlogN) fast implementation
    of Lomb-Scargle from the gatspy package, and also provide a way to
    visualize the result.

    Note that either plotting or calculating L-S power doesn't use the same
    method in finding best cycle. The former can use user-specified
    search_range, while the latter uses default two grid search_range.

    Parameters
    ----------
    feature: string in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}
        "AS": Active state probability
        "F": Food consumed (g)
        "M_AS": Movement outside homebase
        "M_IS": Movement inside homebase
        "W": Water consumed (g)
        "Distance": Distance traveled
    strain: int
        nonnegative integer indicating the strain number
    mouse: int, default is None
        nonnegative integer indicating the mouse number. If None, the data
        of every mouse found for the strain are stacked, each with its own
        time sequence starting at 0.
    bin_width: int, minute unit, default is 15 minutes
        number of minutes, the time interval for data aggregation
    methods: string in {"LombScargleFast", "LombScargle"}
        indicating the method used in determining periods and best cycle.
        If choose 'LombScargle', 'disturb_t' must be True.
    disturb_t: boolean, default is False
        If True, add uniformly distributed noise to the time sequence which
        are used to fit the Lomb Scargle model. This is to avoid the singular
        matrix error that could happen sometimes.
    plot: boolean, default is True
        If True, call the visualization function to plot the Lomb Scargle
        power versus periods plot. First use the data (either strain specific
        or strain-mouse specific) to fit the LS model, then use the
        search_range_fit as time sequence to predict the corresponding LS
        power, at last draw the plot out. There will also be stars and
        horizontal lines indicating the p-value of significance. Three stars
        will be p-value in [0,0.001], two stars will be p-value in
        [0.001,0.01], one star will be p-value in [0.01,0.05]. The horizontal
        line is the LS power that has p-value of 0.05.
    search_range_fit: list, numpy array or numpy arange, hours unit,
        default is None
        list of numbers as the time sequence to predict the corresponding
        Lomb Scargle power. If plot is 'True', these will be drawn as the
        x-axis. Note that the number of search_range_fit points can not be
        too small, or the prediction smooth line will not be accurate.
        However the plot will always give the right periods and their LS
        power with 1,2 or 3 stars. This could be a sign to check whether
        search_range_fit is not enough to draw the correct plot.
        We recommend the default None, which is easy to use.
    nyquist_factor: int
        If search_range_fit is None, the algorithm will automatically choose
        the periods sequence.
        5 * nyquist_factor * length(time sequence) / 2 gives the number of
        power and periods used to make LS prediction and plot the graph.
    n_cycle: int, default is 10
        numbers of periods to be returned by function, which have the highest
        Lomb Scargle power and p-value.
    search_range_find: list, tuple or numpy array with length of 2, default
        is (2,26), hours unit
        Range of periods to be searched for best cycle. Note that the minimum
        should be strictly larger than 0 to avoid 1/0 issues.
    sig: list or numpy array, default is [0.05].
        significance level to be used for plot horizontal line.
    gen_doc: boolean, default is False
        If true, return the parameters needed for visualize the LS power
        versus periods

    Returns
    -------
    cycle: numpy array of length 'n_cycle'
        The best periods with highest LS power and p-values.
    cycle_power: numpy array of length 'n_cycle'
        The corresponding LS power of 'cycle'.
    cycle_pvalue: numpy array of length 'n_cycle'
        The corresponding p-value of 'cycle'.
    periods: numpy array of the same length with 'power'
        use as time sequence in LS model to make predictions. Only return
        when gen_doc is True.
    power: numpy array of the same length with 'periods'
        the corresponding predicted power of periods. Only return when
        gen_doc is True.
    sig: list, tuple or numpy array, default is [0.05].
        significance level to be used for plot horizontal line.
        Only return when gen_doc is True.
    N: int
        the length of time sequence in the fit model. Only return when
        gen_doc is True.

    When gen_doc is true the return order is
    (periods, power, sig, N, cycle, cycle_power, cycle_pvalue).

    Raises
    ------
    ValueError
        If 'feature' is not in ALL_FEATURES or 'methods' is not in METHOD.

    Examples
    -------
    >>> a,b,c = find_cycle(feature='F', strain = 0,mouse = 0, plot=False,)
    >>> print(a,b,c)
    [ 23.98055016   4.81080233  12.00693952   6.01216335   8.0356203
       3.4316698    2.56303353   4.9294791   21.37925713   3.5697756 ]
    [ 0.11543449  0.05138839  0.03853218  0.02982237  0.02275952  0.0147941
      0.01151601  0.00998443  0.00845883  0.0082382 ]
    [  0.00000000e+00   3.29976046e-10   5.39367189e-07   8.10528027e-05
       4.71001953e-03   3.70178834e-01   9.52707020e-01   9.99372657e-01
       9.99999981e-01   9.99999998e-01]
    """
    if feature not in ALL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}')
    if methods not in METHOD:
        raise ValueError(
            'Input value must in {"LombScargleFast","LombScargle"}')

    # Width of one bin in hours.  The float literal keeps this a true
    # division regardless of the interpreter's integer-division rules.
    hours_per_bin = bin_width / 60.0

    # get data
    if mouse is None:
        data_all = aggregate_data(feature=feature, bin_width=bin_width)
        strain_rows = data_all.loc[data_all['strain'] == strain]
        data = []
        t = []
        # Iterate over the mouse ids actually present instead of assuming
        # they are exactly 0..n-1, so no mouse is silently dropped.
        for mouse_id in sorted(set(strain_rows['mouse'])):
            values = strain_rows.loc[
                strain_rows['mouse'] == mouse_id][feature]
            data.extend(values)
            # Every mouse gets its own time axis starting at 0.
            t.extend(np.arange(len(values)) * hours_per_bin)
        N = len(data)
        t = np.asarray(t, dtype=float)
    else:
        if feature == 'Distance':
            data = aggregate_movement(strain=strain, mouse=mouse,
                                      bin_width=bin_width)
        else:
            data = aggregate_interval(strain=strain, mouse=mouse,
                                      feature=feature, bin_width=bin_width)
        N = len(data)
        # Integer arange scaled by the step always has exactly N samples;
        # a float-step arange can overshoot by one element due to rounding.
        t = np.arange(N) * hours_per_bin
    y = data

    # fit model
    if disturb_t:
        # Jitter of at most a tenth of a bin (in hours) on each time stamp.
        t = t + np.random.uniform(-bin_width / 600.0, bin_width / 600.0, N)
    if methods == 'LombScargleFast':
        model = LombScargleFast(fit_period=False).fit(t=t, y=y)
    elif methods == 'LombScargle':
        model = LombScargle(fit_period=False).fit(t=t, y=y)

    # calculate periods' LS power
    if search_range_fit is None:
        periods, power = model.periodogram_auto(nyquist_factor=nyquist_factor)
    else:
        periods = search_range_fit
        power = model.periodogram(periods=search_range_fit)

    # find best cycle
    model.optimizer.period_range = search_range_find
    cycle, cycle_power = model.find_best_periods(return_scores=True,
                                                 n_periods=n_cycle)
    # p-value of each peak: 1 - (1 - exp(-power * (N - 1) / 2)) ** (2 * N)
    cycle_pvalue = 1 - (1 - np.exp(cycle_power / (-2) * (N - 1)))**(2 * N)

    # visualization
    if plot:
        lombscargle_visualize(periods=periods, power=power, sig=sig, N=N,
                              cycle_power=cycle_power,
                              cycle_pvalue=cycle_pvalue, cycle=cycle)

    if gen_doc:
        return periods, power, sig, N, cycle, cycle_power, cycle_pvalue
    return cycle, cycle_power, cycle_pvalue