def test_lomb_scargle_regular_single_freq():
    """
    Test Lomb-Scargle model features on regularly-sampled periodic data with
    one frequency/multiple harmonics. Estimated parameters should be very
    accurate in this case.
    """
    frequencies = np.hstack((WAVE_FREQS[0], np.zeros(len(WAVE_FREQS)-1)))
    amplitudes = np.zeros((len(frequencies), 4))
    amplitudes[0, :] = [8, 4, 2, 1]
    phase = 0.1
    times, values, errors = regular_periodic(frequencies, amplitudes, phase)
    all_lomb = sft.generate_science_features(times, values, errors,
                                             lomb_features)

    # Only test the first (true) frequency; the rest correspond to noise
    npt.assert_allclose(all_lomb['freq1_freq'], frequencies[0])

    # Hard-coded value from previous solution
    npt.assert_allclose(0.001996007984, all_lomb['freq1_lambda'], rtol=1e-7)

    for (i, j), amplitude in np.ndenumerate(amplitudes):
        npt.assert_allclose(amplitude,
                            all_lomb['freq{}_amplitude{}'.format(i+1, j+1)],
                            rtol=1e-2, atol=1e-2)

    # Only test the first (true) frequency; the rest correspond to noise
    for j in range(1, NUM_HARMONICS):
        # NOTE(review): `-1**j` parses as -(1**j), i.e. always -1, so the
        # expected value here is -phase*j. Confirm whether (-1)**j was meant
        # before changing it; the expression is kept as-is on purpose.
        npt.assert_allclose(phase*j*(-1**j),
                            all_lomb['freq1_rel_phase{}'.format(j+1)],
                            rtol=1e-2, atol=1e-2)

    # Frequency ratio is not relevant since there is only one frequency;
    # only test amplitude ratio and significance
    for i in [2, 3]:
        npt.assert_allclose(0.,
                            all_lomb['freq_amplitude_ratio_{}1'.format(i)],
                            atol=1e-3)
    npt.assert_array_less(10., all_lomb['freq1_signif'])

    # Only one frequency, so this should explain basically all the variance
    npt.assert_allclose(0., all_lomb['freq_varrat'], atol=5e-3)

    # Exactly periodic, so the same minima/maxima should reoccur
    npt.assert_allclose(0., all_lomb['freq_model_max_delta_mags'], atol=1e-6)
    npt.assert_allclose(0., all_lomb['freq_model_min_delta_mags'], atol=1e-6)

    # Linear trend should be zero since the signal is exactly sinusoidal
    npt.assert_allclose(0., all_lomb['linear_trend'], atol=1e-4)

    # Residuals from doubling period should be much higher
    npt.assert_array_less(10., all_lomb['medperc90_2p_p'])

    # Slopes should be the same for {un,}folded data; use unfolded for
    # stability (so no period-folded copy of the data is built here)
    slopes = np.diff(values) / np.diff(times)
    npt.assert_allclose(np.percentile(slopes, 10),
                        all_lomb['fold2P_slope_10percentile'], rtol=1e-2)
    npt.assert_allclose(np.percentile(slopes, 90),
                        all_lomb['fold2P_slope_90percentile'], rtol=1e-2)
def test_lomb_scargle_irregular_multi_freq():
    """
    Check Lomb-Scargle features for irregularly-sampled periodic data built
    from several frequencies, each carrying a single harmonic.

    Irregular sampling is harder than the regular case, so the tolerances
    on the estimated parameters are looser.
    """
    frequencies = WAVE_FREQS
    amplitudes = np.zeros((len(frequencies), 4))
    # Only the first harmonic of each frequency is non-zero
    amplitudes[:, 0] = [4, 2, 1]
    phase = 0.1
    times, values, errors = irregular_periodic(frequencies, amplitudes, phase)
    all_lomb = sft.generate_science_features(times, values, errors,
                                             lomb_features)

    # Each fitted frequency should match the corresponding true frequency
    for rank, true_freq in enumerate(frequencies, start=1):
        fitted_freq = all_lomb['freq{}_freq'.format(rank)]
        npt.assert_allclose(true_freq, fitted_freq, rtol=1e-2)

    # Every (frequency, harmonic) amplitude, including the zero ones
    for (row, col), true_amp in np.ndenumerate(amplitudes):
        amp_key = 'freq{}_amplitude{}'.format(row+1, col+1)
        npt.assert_allclose(true_amp, all_lomb[amp_key], rtol=1e-1, atol=1e-1)

    # Ratios of the 2nd and 3rd components relative to the 1st
    for rank in (2, 3):
        expected_amp_ratio = amplitudes[rank-1, 0] / amplitudes[0, 0]
        npt.assert_allclose(
            expected_amp_ratio,
            all_lomb['freq_amplitude_ratio_{}1'.format(rank)],
            atol=2e-2)
        expected_freq_ratio = frequencies[rank-1] / frequencies[0]
        npt.assert_allclose(
            expected_freq_ratio,
            all_lomb['freq_frequency_ratio_{}1'.format(rank)],
            atol=5e-2)

    npt.assert_array_less(10., all_lomb['freq1_signif'])
def test_lomb_scargle_regular_multi_freq():
    """
    Check Lomb-Scargle features for regularly-sampled periodic data built
    from several frequencies, each carrying a single harmonic.

    With regular sampling the estimated parameters should be very accurate.
    """
    frequencies = WAVE_FREQS
    amplitudes = np.zeros((len(frequencies), 4))
    # Only the first harmonic of each frequency is non-zero
    amplitudes[:, 0] = [4, 2, 1]
    phase = 0.1
    times, values, errors = regular_periodic(frequencies, amplitudes, phase)
    all_lomb = sft.generate_science_features(times, values, errors,
                                             lomb_features)

    # Default tolerance: frequencies are expected to be recovered closely
    for rank, true_freq in enumerate(frequencies, start=1):
        fitted_freq = all_lomb['freq{}_freq'.format(rank)]
        npt.assert_allclose(true_freq, fitted_freq)

    # Every (frequency, harmonic) amplitude, including the zero ones
    for (row, col), true_amp in np.ndenumerate(amplitudes):
        amp_key = 'freq{}_amplitude{}'.format(row+1, col+1)
        npt.assert_allclose(true_amp, all_lomb[amp_key], rtol=5e-2, atol=5e-2)

    # Amplitude ratios of the 2nd and 3rd components relative to the 1st
    for rank in (2, 3):
        expected_ratio = amplitudes[rank-1, 0] / amplitudes[0, 0]
        npt.assert_allclose(
            expected_ratio,
            all_lomb['freq_amplitude_ratio_{}1'.format(rank)],
            atol=2e-2)

    npt.assert_array_less(10., all_lomb['freq1_signif'])
def test_lomb_scargle_irregular_single_freq():
    """
    Check Lomb-Scargle features for irregularly-sampled periodic data with a
    single frequency and multiple harmonics.

    Irregular sampling is harder than the regular case, so the tolerances
    on the estimated parameters are looser.
    """
    n_freqs = len(WAVE_FREQS)
    frequencies = np.hstack((WAVE_FREQS[0], np.zeros(n_freqs-1)))
    amplitudes = np.zeros((n_freqs, 4))
    amplitudes[0, :] = [8, 4, 2, 1]
    phase = 0.1
    times, values, errors = irregular_periodic(frequencies, amplitudes, phase)
    all_lomb = sft.generate_science_features(times, values, errors,
                                             lomb_features)

    # Only test the first (true) frequency; the rest correspond to noise
    npt.assert_allclose(all_lomb['freq1_freq'], frequencies[0], rtol=1e-2)

    # Only test first frequency here; noise gives non-zero amplitudes for
    # residuals
    for harmonic in range(NUM_HARMONICS):
        fitted_amp = all_lomb['freq1_amplitude{}'.format(harmonic+1)]
        npt.assert_allclose(amplitudes[0, harmonic], fitted_amp,
                            rtol=5e-2, atol=5e-2)
        if harmonic < 1:
            # No relative phase is checked for the first harmonic
            continue
        # NOTE(review): `-1**harmonic` parses as -(1**harmonic), i.e. always
        # -1; confirm whether (-1)**harmonic was intended.
        npt.assert_allclose(phase*harmonic*(-1**harmonic),
                            all_lomb['freq1_rel_phase{}'.format(harmonic+1)],
                            rtol=1e-1, atol=1e-1)

    npt.assert_array_less(10., all_lomb['freq1_signif'])

    # Only one frequency, so this should explain basically all the variance
    npt.assert_allclose(0., all_lomb['freq_varrat'], atol=5e-3)

    # The fitted offset is compared against the negated mean of the data
    npt.assert_allclose(-np.mean(values), all_lomb['freq_y_offset'],
                        rtol=5e-2)
def test_feature_generation():
    """Check generated science features against stored reference values."""

    def features_from_csv(filename):
        # First line holds the comma-separated feature names; the remaining
        # lines are the numeric feature values.
        with open(filename) as csv_file:
            header = csv_file.readline().strip().split(",")
            table = np.loadtxt(csv_file, delimiter=',')
        return header, table

    here = os.path.dirname(__file__)
    test_files = [os.path.join(here, rel_path)
                  for rel_path in ('data/257141.dat',
                                   'data/245486.dat',
                                   'data/247327.dat')]

    features_extracted = None
    values_computed = None
    for row, ts_path in enumerate(test_files):
        t, m, e = ctt.parse_ts_data(ts_path)
        features = sft.generate_science_features(t, m, e,
                                                 cfg.features_list_science)
        # Sort by feature name so column order is deterministic
        name_value_pairs = sorted(features.items())
        if features_extracted is None:
            # Names (and the output array shape) come from the first file
            features_extracted = [name for name, _ in name_value_pairs]
            values_computed = np.zeros((len(test_files),
                                        len(features_extracted)))
        values_computed[row, :] = [pair[1] for pair in name_value_pairs]

    features_expected, values_expected = features_from_csv(
        os.path.join(here, "data/expected_features.csv"))

    npt.assert_equal(features_extracted, features_expected)
    npt.assert_array_almost_equal(values_computed, values_expected)
def test_weighted_average():
    """Test weighted average and distance from weighted average features."""
    times, values, errors = irregular_random()
    f = sft.generate_science_features(times, values, errors,
                                      ['weighted_average'])
    weighted_std_err = 1. / sum(errors**2)
    error_weights = 1. / (errors)**2 / weighted_std_err

    weighted_avg = np.average(values, weights=error_weights)
    # list(...) is needed because dict views are not subscriptable on
    # Python 3; this form also works on Python 2.
    npt.assert_allclose(list(f.values())[0], weighted_avg)

    dists_from_weighted_avg = values - weighted_avg
    stds_from_weighted_avg = (dists_from_weighted_avg /
                              np.sqrt(weighted_std_err))
    # TODO broken feature
    f = sft.generate_science_features(times, values, errors,
                                      ['percent_beyond_1_std'])
    npt.assert_equal(list(f.values())[0],
                     np.mean(stds_from_weighted_avg > 1.))
def test_stetson():
    """Test Stetson variability features."""
    times, values, errors = irregular_random(size=201)
    f = sft.generate_science_features(times, values, errors, ['stetson_j'])
    # list(...) is needed because dict views are not subscriptable on
    # Python 3; this form also works on Python 2.
    stetson_j = list(f.values())[0]

    # Stetson mean approximately equal to standard mean for large inputs
    # NOTE(review): 0.1 is presumably the error scale of the generated data;
    # confirm against irregular_random.
    dists = (np.sqrt(float(len(values)) / (len(values) - 1.)) *
             (values - np.mean(values)) / 0.1)
    npt.assert_allclose(
        stetson_j,
        np.mean(np.sign(dists**2-1)*np.sqrt(np.abs(dists**2-1))),
        rtol=1e-2)

    # Stetson_j should be somewhat close to (scaled) variance for normal data
    npt.assert_allclose(stetson_j*0.1, np.var(values), rtol=2e-1)

    # Hard-coded original value
    npt.assert_allclose(stetson_j, 7.591347175195703)

    f = sft.generate_science_features(times, values, errors, ['stetson_k'])
    stetson_k = list(f.values())[0]
    npt.assert_allclose(
        stetson_k,
        1./0.798 * np.mean(np.abs(dists)) / np.sqrt(np.mean(dists**2)),
        rtol=5e-4)

    # Hard-coded original value
    npt.assert_allclose(stetson_k, 1.0087218792719013)
def test_scatter_res_raw():
    """Check the feature measuring scatter of Lomb-Scargle residuals."""

    def _mad(samples):
        # Median absolute deviation from the median
        return np.median(np.abs(samples - np.median(samples)))

    times, values, errors = irregular_random()
    lomb_model = sft.sf.lomb_scargle_model(times, values, errors)

    # Residuals with respect to the first fitted frequency's model
    first_fit = lomb_model['freq_fits'][0]
    residuals = values - first_fit['model']
    resid_mad = _mad(residuals)
    value_mad = _mad(values)

    f = sft.generate_science_features(times, values, errors,
                                      ['scatter_res_raw'])
    expected_ratio = resid_mad / value_mad
    npt.assert_allclose(f['scatter_res_raw'], expected_ratio, atol=3e-2)
def test_percent_close_to_median():
    """Test feature which finds the percentage of points near the median
    value."""
    times, values, errors = irregular_random()
    f = sft.generate_science_features(times, values, errors,
                                      ['percent_close_to_median'])
    # NOTE(review): this differs from the plain half-range
    # (max(values) - min(values)) / 2. whenever max and min have opposite
    # signs; confirm which definition the feature is meant to use.
    amplitude = (np.abs(max(values)) - np.abs(min(values))) / 2.
    within_buffer = np.abs(values - np.median(values)) < 0.2*amplitude
    # list(...) is needed because dict views are not subscriptable on
    # Python 3; this form also works on Python 2.
    npt.assert_allclose(list(f.values())[0], np.mean(within_buffer))
def test_qso_features():
    """
    Check the features measuring the fit of the QSO model.

    The reference values are hard-coded results from a previous
    implementation; no example with a closed-form solution is known.
    """
    times, values, errors = irregular_random()
    qso_feature_names = ['qso_log_chi2_qsonu', 'qso_log_chi2nuNULL_chi2nu']
    f = sft.generate_science_features(times, values, errors,
                                      qso_feature_names)

    reference_values = (
        ('qso_log_chi2_qsonu', 6.9844064754),
        ('qso_log_chi2nuNULL_chi2nu', -0.456526327522),
    )
    for feature_name, reference in reference_values:
        npt.assert_allclose(f[feature_name], reference)
def test_amplitude():
    """Test features related to amplitude/magnitude percentiles."""
    times, values, errors = irregular_random()

    def single_feature(name):
        # Generate one feature and return its value. list(...) is needed
        # because dict views are not subscriptable on Python 3; this form
        # also works on Python 2.
        f = sft.generate_science_features(times, values, errors, [name])
        return list(f.values())[0]

    npt.assert_allclose(single_feature('amplitude'),
                        (max(values) - min(values)) / 2.)

    # The expected percent amplitude is computed from 10**(-0.4 * value)
    max_scaled = 10**(-0.4 * max(values))
    min_scaled = 10**(-0.4 * min(values))
    med_scaled = 10**(-0.4 * np.median(values))
    peak_from_median = max(abs((max_scaled - med_scaled) / med_scaled),
                           abs((min_scaled - med_scaled) / med_scaled))
    npt.assert_allclose(single_feature('percent_amplitude'),
                        peak_from_median, rtol=5e-4)

    band_offset = 13.72
    # 1 erg/s/cm^2 = 10^-3 w/m^2
    w_m2 = 10**(-0.4*(values+band_offset)-3)
    npt.assert_allclose(
        single_feature('percent_difference_flux_percentile'),
        np.diff(np.percentile(w_m2, [5, 95])) / np.median(w_m2))

    # Each ratio compares a central percentile range with the 5-95 range
    mid_ranges = [
        ('flux_percentile_ratio_mid20', [40, 60]),
        ('flux_percentile_ratio_mid35', [32.5, 67.5]),
        ('flux_percentile_ratio_mid50', [25, 75]),
        ('flux_percentile_ratio_mid65', [17.5, 82.5]),
        ('flux_percentile_ratio_mid80', [10, 90]),
    ]
    for feature_name, percentiles in mid_ranges:
        npt.assert_allclose(
            single_feature(feature_name),
            np.diff(np.percentile(w_m2, percentiles)) /
            np.diff(np.percentile(w_m2, [5, 95])))
def test_lomb_scargle_period_folding():
    """
    Check the features derived from fitting a Lomb-Scargle periodic model
    and period-folding the data by the estimated period.
    """
    n_freqs = len(WAVE_FREQS)
    frequencies = np.hstack((WAVE_FREQS[0], np.zeros(n_freqs-1)))
    amplitudes = np.zeros((n_freqs, 4))
    amplitudes[0, :] = [8, 4, 2, 1]
    phase = 0.1
    times, values, errors = irregular_periodic(frequencies, amplitudes, phase)
    all_lomb = sft.generate_science_features(times, values, errors,
                                             lomb_features)

    # Folding is numerically unstable so we need to use the exact fitted
    # frequency
    freq_est = all_lomb['freq1_freq']

    # Values re-ordered by phase after folding by 1*period
    order_1p = np.argsort((times-times[0]) % (1./freq_est))
    fold1ed_values = values[order_1p]

    # Values re-ordered by phase after folding by 2*period
    order_2p = np.argsort((times-times[0]) % (2./freq_est))
    fold2ed_values = values[order_2p]

    raw_sum_sq_diff = np.sum(np.diff(values)**2)
    folded2_sum_sq_diff = np.sum(np.diff(fold2ed_values)**2)
    npt.assert_allclose(folded2_sum_sq_diff / raw_sum_sq_diff,
                        all_lomb['p2p_scatter_2praw'])

    npt.assert_allclose(
        np.sum(np.diff(values)**2) / ((len(values) - 1) * np.var(values)),
        all_lomb['p2p_ssqr_diff_over_var'])

    raw_median_step = np.median(np.abs(np.diff(values)))
    npt.assert_allclose(
        raw_median_step / np.median(np.abs(values-np.median(values))),
        all_lomb['p2p_scatter_over_mad'])

    folded1_median_step = np.median(np.abs(np.diff(fold1ed_values)))
    npt.assert_allclose(
        folded1_median_step / np.median(np.abs(values-np.median(values))),
        all_lomb['p2p_scatter_pfold_over_mad'])
def test_std():
    """Test standard deviation feature."""
    times, values, errors = irregular_random()
    f = sft.generate_science_features(times, values, errors, ['std'])
    # list(...) is needed because dict views are not subscriptable on
    # Python 3; this form also works on Python 2.
    npt.assert_allclose(list(f.values())[0], np.std(values))
def test_skew():
    """Test statistical skew feature."""
    from scipy import stats
    times, values, errors = irregular_random()
    f = sft.generate_science_features(times, values, errors, ['skew'])
    # list(...) is needed because dict views are not subscriptable on
    # Python 3; this form also works on Python 2.
    npt.assert_allclose(list(f.values())[0], stats.skew(values))
def test_median():
    """Test median value feature."""
    times, values, errors = irregular_random()
    f = sft.generate_science_features(times, values, errors, ['median'])
    # list(...) is needed because dict views are not subscriptable on
    # Python 3; this form also works on Python 2.
    npt.assert_allclose(list(f.values())[0], np.median(values))
def test_median_absolute_deviation():
    """Test median absolute deviation (from the median) feature."""
    times, values, errors = irregular_random()
    f = sft.generate_science_features(times, values, errors,
                                      ['median_absolute_deviation'])
    # list(...) is needed because dict views are not subscriptable on
    # Python 3; this form also works on Python 2.
    npt.assert_allclose(list(f.values())[0],
                        np.median(np.abs(values - np.median(values))))
def featurize_single_ts(ts, features_to_use, custom_script_path=None,
                        custom_functions=None, use_docker=True):
    """Compute feature values for a given single time-series. Data is
    returned as dictionaries/lists of lists (as opposed to a more convenient
    DataFrame/DataSet) since it will be serialized as part of
    `celery_tasks.featurize_ts_file`.

    Parameters
    ----------
    ts : TimeSeries object
        Single time series to be featurized.
    features_to_use : list of str
        List of feature names to be generated.
    custom_script_path : str, optional
        Path to custom features script .py file to be run in Docker
        container. Takes precedence over `custom_functions` when both are
        given.
    custom_functions : dict, optional
        Dictionary of custom feature functions to be evaluated for the given
        time series, or a dictionary representing a dask graph of function
        evaluations. Dictionaries of functions should have keys
        `feature_name` and values functions that take arguments (t, m, e);
        in the case of a dask graph, these arrays should be referenced as
        't', 'm', 'e', respectively, and any values with keys present in
        `features_to_use` will be computed.
    use_docker : bool, optional
        Bool specifying whether to generate custom features inside a Docker
        container. Defaults to True.

    Returns
    -------
    dict
        Dictionary with feature names as keys, lists of feature values (one
        per channel) as values.
    """
    # One slot per channel for every requested feature, initialised to 0.
    all_feature_lists = {feature: [0.] * ts.n_channels
                         for feature in features_to_use}

    for (t_i, m_i, e_i), i in zip(ts.channels(), range(ts.n_channels)):
        obs_features = oft.generate_obs_features(t_i, m_i, e_i,
                                                 features_to_use)
        science_features = sft.generate_science_features(t_i, m_i, e_i,
                                                         features_to_use)
        # Features that are already computed and can be handed to custom
        # code; later sources win on duplicate names.
        known_features = dict(list(obs_features.items()) +
                              list(science_features.items()) +
                              list(ts.meta_features.items()))

        if custom_script_path:
            custom_features = cft.generate_custom_features(
                custom_script_path, t_i, m_i, e_i,
                features_already_known=known_features,
                use_docker=use_docker)
            # Keep only the custom features that were actually requested
            custom_features = {key: custom_features[key]
                               for key in custom_features.keys()
                               if key in features_to_use}
        elif custom_functions:
            # If all values in custom_functions are functions, evaluate each
            if all(hasattr(v, '__call__')
                   for v in custom_functions.values()):
                custom_features = {feature: f(t_i, m_i, e_i)
                                   for feature, f in custom_functions.items()
                                   if feature in features_to_use}
            # Otherwise, custom_functions is a dask graph
            else:
                dask_graph = {key: value
                              for key, value in custom_functions.items()
                              if key in features_to_use}
                # Record the requested keys before adding the input nodes
                dask_keys = list(dask_graph.keys())
                dask_graph['t'] = t_i
                dask_graph['m'] = m_i
                dask_graph['e'] = e_i
                dask_graph.update(known_features)
                # `dask.get` is the top-level alias for the synchronous
                # scheduler. The former spelling `dask.async.get_sync` is a
                # SyntaxError on Python >= 3.7, where `async` is reserved.
                custom_features = dict(zip(dask_keys,
                                           dask.get(dask_graph, dask_keys)))
        else:
            custom_features = {}

        # We set values in this order so that custom features take priority
        # over MLTSP features in the case of name conflicts
        for feature, value in (list(obs_features.items()) +
                               list(science_features.items()) +
                               list(custom_features.items())):
            all_feature_lists[feature][i] = value

    return all_feature_lists
def test_min():
    """Test minimum value feature."""
    times, values, errors = irregular_random()
    f = sft.generate_science_features(times, values, errors, ['minimum'])
    # list(...) is needed because dict views are not subscriptable on
    # Python 3; this form also works on Python 2.
    npt.assert_equal(list(f.values())[0], min(values))