def test_progress_bars():

    threeML_config.interface.progress_bars = 'on'

    toggle_progress_bars()

    assert not threeML_config.interface.progress_bars

    toggle_progress_bars()

    assert threeML_config.interface.progress_bars

    silence_progress_bars()

    for i in tqdm(range(10), desc="test"):
        pass

    for i in trange(1, 10, 1, desc="test"):
        pass

    assert not threeML_config.interface.progress_bars

    activate_progress_bars()

    for i in tqdm(range(10), desc="test"):
        pass

    for i in trange(1, 10, 1, desc="test"):
        pass

    assert threeML_config.interface.progress_bars
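# The test above relies on the tqdm/trange wrappers honoring the
# threeML_config.interface.progress_bars flag. A minimal sketch of how such a
# wrapper could be written (hypothetical illustration, not the actual threeML
# implementation; `threeML_config` is assumed to be in scope as in the test):

from tqdm.auto import tqdm as _tqdm


def tqdm_sketch(*args, **kwargs):
    # Disable the bar whenever progress bars are silenced in the configuration.
    kwargs.setdefault("disable", not threeML_config.interface.progress_bars)
    return _tqdm(*args, **kwargs)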
def execute_with_progress_bar(self, worker, items, chunk_size=None, name="progress"):

    # Let's make a wrapper which will allow us to recover the order
    def wrapper(x):

        (id, item) = x

        return (id, worker(item))

    items_wrapped = [(i, item) for i, item in enumerate(items)]

    amr = self._interactive_map(wrapper,
                                items_wrapped,
                                ordered=False,
                                chunk_size=chunk_size)

    results = []

    for i, res in enumerate(tqdm(amr, desc=name)):

        results.append(res)

    # Reorder the list according to the id
    return list(map(lambda x: x[1], sorted(results, key=lambda x: x[0])))
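# Hypothetical usage sketch for execute_with_progress_bar (not from the source):
# the worker is applied to every item while a bar tracks completion, and the
# results come back in the original order even though the map is unordered.

def _square(x):
    return x * x

# `client` is assumed to be an instance of the class defining the method above,
# e.g. a ParallelClient as used elsewhere in this section.
# results = client.execute_with_progress_bar(_square, list(range(100)),
#                                            name="Squaring numbers")
# assert results == [x * x for x in range(100)]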
def _get_errors(self):
    """
    Override this method if the minimizer provides a function to get all errors at once.
    If instead it provides a method to get one error at a time, override the
    _get_one_error method.

    :return: an ordered dictionary parameter_path -> (negative_error, positive_error)
    """

    # TODO: options for other significance levels
    target_delta_log_like = 0.5

    errors = collections.OrderedDict()

    p = tqdm(total=2 * len(self.parameters), desc="Computing errors")

    for parameter_name in self.parameters:

        negative_error = self._get_one_error(
            parameter_name, target_delta_log_like, -1)

        p.update(1)

        positive_error = self._get_one_error(
            parameter_name, target_delta_log_like, +1)

        p.update(1)

        errors[parameter_name] = (negative_error, positive_error)

    return errors
def _evaluate(self):
    """
    Calculate the best or mean fit of the new function or quantity.

    :return:
    """

    # if there are independent variables
    if self._independent_variable_range:

        variates = []

        # scroll through the independent variables
        n_iterations = np.prod(self._out_shape)

        with use_astromodels_memoization(False):

            variables = list(
                itertools.product(*self._independent_variable_range))

            if len(variables) > 1:

                for v in tqdm(variables, desc="Propagating errors"):

                    variates.append(self._propagated_function(*v))

            else:

                for v in variables:

                    variates.append(self._propagated_function(*v))

    # otherwise just evaluate
    else:

        variates = self._propagated_function()

    # create a variates container
    self._propagated_variates = VariatesContainer(
        variates, self._out_shape, self._cl, self._transform,
        self._equal_tailed)
def _step2d(self, steps1, steps2):

    log_likes = np.zeros((len(steps1), len(steps2)))

    if threeML_config.interface.progress_bars:
        p = tqdm(total=len(steps1) * len(steps2),
                 desc="Profiling likelihood")

    for i, step1 in enumerate(steps1):

        for j, step2 in enumerate(steps2):

            if self._n_free_parameters > 0:

                # Profile out the free parameters
                self._wrapper.set_fixed_values([step1, step2])

                try:

                    _, this_log_like = self._optimizer.minimize(
                        compute_covar=False)

                except FitFailed:

                    # If the user is stepping too far it might be that the fit fails.
                    # It is usually not a problem.
                    this_log_like = np.nan

            else:

                # No free parameters, just compute the likelihood
                this_log_like = self._function(step1, step2)

            log_likes[i, j] = this_log_like

            if threeML_config.interface.progress_bars:
                p.update(1)

    return log_likes
def _step1d(self, steps1):

    log_likes = np.zeros_like(steps1)

    for i, step in enumerate(tqdm(steps1, desc="Profiling likelihood")):

        if self._n_free_parameters > 0:

            # Profile out the free parameters
            self._wrapper.set_fixed_values(step)

            _, this_log_like = self._optimizer.minimize(compute_covar=False)

        else:

            # No free parameters, just compute the likelihood
            this_log_like = self._function(step)

        log_likes[i] = this_log_like

    return log_likes
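# The 1D scan above iterates tqdm directly, while _step2d drives the bar
# manually with update(1). A small self-contained illustration of the manual
# pattern (the step values here are invented purely for the example):

import numpy as np
from tqdm.auto import tqdm

steps1 = np.linspace(0.0, 1.0, 5)
steps2 = np.linspace(0.0, 1.0, 4)

bar = tqdm(total=len(steps1) * len(steps2), desc="Profiling likelihood")

for s1 in steps1:
    for s2 in steps2:
        _ = s1 * s2  # placeholder for the per-point likelihood evaluation
        bar.update(1)

bar.close()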
def _unbinned_fit_polynomials(self, bayes=False):

    self._poly_fit_exists = True

    # Select all the events that are in the background regions
    # and make a mask

    all_bkg_masks = []

    total_duration = 0.0
    poly_exposure = 0

    for selection in self._poly_intervals:

        total_duration += selection.duration

        poly_exposure += self.exposure_over_interval(
            selection.start_time, selection.stop_time)

        all_bkg_masks.append(
            np.logical_and(
                self._arrival_times >= selection.start_time,
                self._arrival_times <= selection.stop_time,
            ))

    poly_mask = all_bkg_masks[0]

    # If there are multiple masks:
    if len(all_bkg_masks) > 1:

        for mask in all_bkg_masks[1:]:

            poly_mask = np.logical_or(poly_mask, mask)

    # Select all the events in the poly selections
    # We only need to do this once

    total_poly_events = self._arrival_times[poly_mask]

    # For the channel energies we will need to down select again.
    # We can go ahead and do this to avoid repeated computations

    total_poly_energies = self._measurement[poly_mask]

    # Now we will find the best poly order unless the user specified one

    if self._user_poly_order == -1:

        self._optimal_polynomial_grade = (
            self._unbinned_fit_global_and_determine_optimum_grade(
                total_poly_events, poly_exposure, bayes=bayes))

        log.info("Auto-determined polynomial order: %d" %
                 self._optimal_polynomial_grade)

    else:

        self._optimal_polynomial_grade = self._user_poly_order

    channels = list(
        range(self._first_channel, self._n_channels + self._first_channel))

    # Check whether we are parallelizing or not

    t_start = self._poly_intervals.start_times
    t_stop = self._poly_intervals.stop_times

    if threeML_config["parallel"]["use_parallel"]:

        def worker(channel):

            channel_mask = total_poly_energies == channel

            # Mask background events and current channel
            # poly_chan_mask = np.logical_and(poly_mask, channel_mask)
            # Select the masked events

            current_events = total_poly_events[channel_mask]

            polynomial, _ = unbinned_polyfit(current_events,
                                             self._optimal_polynomial_grade,
                                             t_start,
                                             t_stop,
                                             poly_exposure,
                                             bayes=bayes)

            return polynomial

        client = ParallelClient()

        polynomials = client.execute_with_progress_bar(
            worker, channels, name=f"Fitting {self._instrument} background")

    else:

        polynomials = []

        for channel in tqdm(channels,
                            desc=f"Fitting {self._instrument} background"):

            channel_mask = total_poly_energies == channel

            # Mask background events and current channel
            # poly_chan_mask = np.logical_and(poly_mask, channel_mask)
            # Select the masked events

            current_events = total_poly_events[channel_mask]

            polynomial, _ = unbinned_polyfit(current_events,
                                             self._optimal_polynomial_grade,
                                             t_start,
                                             t_stop,
                                             poly_exposure,
                                             bayes=bayes)

            polynomials.append(polynomial)

    # We are now ready to return the polynomials
    self._polynomials = polynomials
def _fit_polynomials(self, bayes=False):
    """
    Fit a polynomial to all channels over the input time intervals.

    :param fit_intervals: str input intervals
    :return:
    """

    # mark that we have fit a poly now
    self._poly_fit_exists = True

    # we need to adjust the selection to the true intervals of the time-binned spectra
    tmp_poly_intervals = self._poly_intervals

    poly_intervals = self._adjust_to_true_intervals(tmp_poly_intervals)

    self._poly_intervals = poly_intervals

    # now let's get all the counts, exposure and midpoints for the selection

    selected_counts = []
    selected_exposure = []
    selected_midpoints = []

    for selection in poly_intervals:

        # get the mask of these bins
        mask = self._select_bins(selection.start_time, selection.stop_time)

        # the counts will be (time, channel) here,
        # so the mask is selecting time.
        # a sum along axis=0 is a sum in time, while axis=1 is a sum in energy

        selected_counts.extend(
            self._binned_spectrum_set.counts_per_bin[mask])

        selected_exposure.extend(
            self._binned_spectrum_set.exposure_per_bin[mask])

        selected_midpoints.extend(
            self._binned_spectrum_set.time_intervals.mid_points[mask])

    selected_counts = np.array(selected_counts)
    selected_midpoints = np.array(selected_midpoints)
    selected_exposure = np.array(selected_exposure)

    # Now we will find the best poly order unless the user specified one
    # The total counts (summed over channels) are used

    if self._user_poly_order == -1:

        self._optimal_polynomial_grade = (
            self._fit_global_and_determine_optimum_grade(
                selected_counts.sum(axis=1),
                selected_midpoints,
                selected_exposure,
                bayes=bayes,
            ))

        log.info("Auto-determined polynomial order: %d" %
                 self._optimal_polynomial_grade)

    else:

        self._optimal_polynomial_grade = self._user_poly_order

    if threeML_config["parallel"]["use_parallel"]:

        def worker(counts):

            with silence_console_log():

                polynomial, _ = polyfit(
                    selected_midpoints,
                    counts,
                    self._optimal_polynomial_grade,
                    selected_exposure,
                    bayes=bayes,
                )

            return polynomial

        client = ParallelClient()

        polynomials = client.execute_with_progress_bar(
            worker,
            selected_counts.T,
            name=f"Fitting {self._instrument} background")

    else:

        polynomials = []

        # now fit the light curve of each channel
        # and save the estimated polynomial

        for counts in tqdm(selected_counts.T,
                           desc=f"Fitting {self._instrument} background"):

            with silence_console_log():

                polynomial, _ = polyfit(
                    selected_midpoints,
                    counts,
                    self._optimal_polynomial_grade,
                    selected_exposure,
                    bayes=bayes,
                )

            polynomials.append(polynomial)

    self._polynomials = polynomials
def _minimize(self):

    assert len(self._grid) > 0, \
        "You need to set up a grid using add_parameter_to_grid"

    if self._2nd_minimization is None:

        raise RuntimeError(
            "You did not setup this global minimizer (GRID). "
            "You need to use the .setup() method")

    # For each point in the grid, perform a fit

    parameters = list(self._grid.keys())

    overall_minimum = 1e20
    internal_best_fit_values = None

    n_iterations = np.prod([x.shape for x in list(self._grid.values())])

    if threeML_config.interface.progress_bars:
        p = tqdm(total=n_iterations, desc="Grid Minimization")

    for values_tuple in itertools.product(*list(self._grid.values())):

        # Reset everything to the original values, so that the fit will always start
        # from there instead of from the values obtained in the last iteration, which
        # might have gone completely awry

        for par_name, par_value in self._original_values.items():

            self.parameters[par_name].value = par_value

        # Now set the parameters in the grid to their starting values

        for i, this_value in enumerate(values_tuple):

            self.parameters[parameters[i]].value = this_value

        # Get a new instance of the minimizer. We need to do this instead of reusing an
        # existing instance because some minimizers (like iminuit) keep internal track of
        # their status, so that reusing a minimizer would create correlation between the
        # different points.
        # NOTE: this line necessarily needs to be after the values of the parameters have
        # been set to the point, because the init method of the minimizer instance will use
        # those values to set the starting point for the fit

        _minimizer = self._2nd_minimization.get_instance(
            self.function, self.parameters, verbosity=0)

        # Perform fit

        try:

            # We call _minimize() and not minimize() so that the best fit values are
            # in the internal system.
            this_best_fit_values_internal, this_minimum = _minimizer._minimize()

        except:

            # A failure is not a problem here; only if all fits fail do we have a problem,
            # but that case is handled later
            continue

        # If this minimum is the overall minimum, save the result

        if this_minimum < overall_minimum:

            overall_minimum = this_minimum
            internal_best_fit_values = this_best_fit_values_internal

        # Use callbacks (if any)

        for callback in self._callbacks:

            callback(values_tuple, this_minimum)

        if threeML_config.interface.progress_bars:
            p.update(1)

    if internal_best_fit_values is None:

        log.error("All fits starting from values in the grid have failed!")

        raise AllFitFailed()

    return internal_best_fit_values, overall_minimum
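# The grid walk above enumerates every combination of the per-parameter grids
# with itertools.product. A tiny self-contained illustration of that pattern
# (the grid values below are invented for the example):

import itertools

import numpy as np

grid = {
    "index": np.array([-2.0, -1.5, -1.0]),
    "cutoff": np.array([100.0, 500.0]),
}

n_iterations = np.prod([x.shape for x in grid.values()])  # 3 * 2 = 6 starting points

for values_tuple in itertools.product(*grid.values()):
    # each tuple is one starting point, e.g. (-2.0, 100.0)
    pass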
def _setup_analysis_dictionaries(
    analysis_results,
    energy_range,
    energy_unit,
    flux_unit,
    use_components,
    components_to_use,
    confidence_level,
    equal_tailed,
    differential,
    sources_to_use,
    include_extended,
):
    """
    Helper function to pull out analysis details that are common to flux and plotting functions.

    :param analysis_results:
    :param energy_range:
    :param energy_unit:
    :param flux_unit:
    :param use_components:
    :param components_to_use:
    :param confidence_level:
    :param equal_tailed:
    :param differential:
    :param sources_to_use:
    :param include_extended:
    :return:
    """

    bayesian_analyses = collections.OrderedDict()
    mle_analyses = collections.OrderedDict()

    # first we split up the bayesian and mle analyses

    mle_sources = collections.OrderedDict()
    bayes_sources = collections.OrderedDict()

    for analysis in analysis_results:

        items = (list(analysis.optimized_model.point_sources.items())
                 if not include_extended else list(
                     analysis.optimized_model.sources.items()))

        for source_name, source in items:

            if source_name in sources_to_use or not sources_to_use:

                if analysis.analysis_type == "MLE":

                    # keep track of duplicate sources
                    mle_sources.setdefault(source_name, []).append(1)

                    if len(mle_sources[source_name]) > 1:

                        name = "%s_%d" % (source_name,
                                          len(mle_sources[source_name]))

                    else:

                        name = source_name

                    try:

                        comps = [
                            c.name
                            for c in source.spectrum.main.composite.functions
                        ]

                    except:

                        comps = []

                    # duplicate components
                    comps = [
                        "%s_n%i" % (s, suffix) if num > 1 else s
                        for s, num in list(collections.Counter(comps).items())
                        for suffix in range(1, num + 1)
                    ]

                    mle_analyses[name] = {
                        "source": source_name,
                        "analysis": analysis,
                        "component_names": comps,
                    }

                else:

                    # keep track of duplicate sources
                    bayes_sources.setdefault(source_name, []).append(1)

                    if len(bayes_sources[source_name]) > 1:

                        name = "%s_%d" % (source_name,
                                          len(bayes_sources[source_name]))

                    else:

                        name = source_name

                    try:

                        comps = [
                            c.name
                            for c in source.spectrum.main.composite.functions
                        ]

                    except:

                        comps = []

                    # duplicate components
                    comps = [
                        "%s_n%i" % (s, suffix) if num > 1 else s
                        for s, num in list(collections.Counter(comps).items())
                        for suffix in range(1, num + 1)
                    ]

                    bayesian_analyses[name] = {
                        "source": source_name,
                        "analysis": analysis,
                        "component_names": comps,
                    }

    # keep track of the number of sources we will use

    num_sources_to_use = 0

    # go through the MLE analyses and build up some fitted sources

    for key in tqdm(list(mle_analyses.keys()),
                    desc="processing MLE analyses"):

        # if we want to use this source
        if (not use_components or ("total" in components_to_use)
                or (not mle_analyses[key]["component_names"])):

            mle_analyses[key][
                "fitted point source"] = FittedPointSourceSpectralHandler(
                    mle_analyses[key]["analysis"],
                    mle_analyses[key]["source"],
                    energy_range,
                    energy_unit,
                    flux_unit,
                    confidence_level,
                    equal_tailed=equal_tailed,
                    is_differential_flux=differential,
                )

            num_sources_to_use += 1

        # see if there are any components to use
        if use_components:

            num_components_to_use = 0

            component_dict = {}

            for component in mle_analyses[key]["component_names"]:

                # if we want to plot all the components
                if not components_to_use:

                    component_dict[
                        component] = FittedPointSourceSpectralHandler(
                            mle_analyses[key]["analysis"],
                            mle_analyses[key]["source"],
                            energy_range,
                            energy_unit,
                            flux_unit,
                            confidence_level,
                            equal_tailed,
                            component=component,
                            is_differential_flux=differential,
                        )

                    num_components_to_use += 1

                else:

                    # otherwise pick off only the ones of interest
                    if component in components_to_use:

                        component_dict[
                            component] = FittedPointSourceSpectralHandler(
                                mle_analyses[key]["analysis"],
                                mle_analyses[key]["source"],
                                energy_range,
                                energy_unit,
                                flux_unit,
                                confidence_level,
                                equal_tailed,
                                component=component,
                                is_differential_flux=differential,
                            )

                        num_components_to_use += 1

            # save these to the dict
            mle_analyses[key]["components"] = component_dict

        # keep track of how many components we need to plot
        if use_components:

            num_sources_to_use += num_components_to_use

            if "total" in components_to_use:

                num_sources_to_use += 1

        # else:
        #
        #     num_sources_to_use += 1

    # repeat for the Bayesian analyses

    for key in tqdm(list(bayesian_analyses.keys()),
                    desc="processing Bayesian analyses"):

        # if we have a source to use
        if (not use_components or ("total" in components_to_use)
                or (not bayesian_analyses[key]["component_names"])):

            bayesian_analyses[key][
                "fitted point source"] = FittedPointSourceSpectralHandler(
                    bayesian_analyses[key]["analysis"],
                    bayesian_analyses[key]["source"],
                    energy_range,
                    energy_unit,
                    flux_unit,
                    confidence_level,
                    equal_tailed,
                    is_differential_flux=differential,
                )

            num_sources_to_use += 1

        # if we want to use components
        if use_components:

            num_components_to_use = 0

            component_dict = {}

            for component in bayesian_analyses[key]["component_names"]:

                # extracting all components
                if not components_to_use:

                    component_dict[
                        component] = FittedPointSourceSpectralHandler(
                            bayesian_analyses[key]["analysis"],
                            bayesian_analyses[key]["source"],
                            energy_range,
                            energy_unit,
                            flux_unit,
                            confidence_level,
                            equal_tailed,
                            component=component,
                            is_differential_flux=differential,
                        )

                    num_components_to_use += 1

                # or just some of them
                if component in components_to_use:

                    component_dict[
                        component] = FittedPointSourceSpectralHandler(
                            bayesian_analyses[key]["analysis"],
                            bayesian_analyses[key]["source"],
                            energy_range,
                            energy_unit,
                            flux_unit,
                            confidence_level,
                            equal_tailed,
                            component=component,
                            is_differential_flux=differential,
                        )

                    num_components_to_use += 1

            bayesian_analyses[key]["components"] = component_dict

        # keep track of everything we added on
        if use_components and num_components_to_use > 0:

            num_sources_to_use += num_components_to_use

            if "total" in components_to_use:

                num_sources_to_use += 1

        # else:
        #
        #     num_sources_to_use += 1

    # we may have the same source in a Bayesian and an MLE analysis.
    # we want to plot them, but make sure to label them differently.
    # so let's keep track of them

    duplicate_keys = []

    for key in list(mle_analyses.keys()):

        if key in list(bayesian_analyses.keys()):

            duplicate_keys.append(key)

    return mle_analyses, bayesian_analyses, num_sources_to_use, duplicate_keys
def _fit_polynomials(self, bayes=False):
    """
    Binned fit to each channel. Sets the polynomial array that will be used
    to compute counts over an interval.

    :return:
    """

    self._poly_fit_exists = True

    # Select all the events that are in the background regions
    # and make a mask

    all_bkg_masks = []

    for selection in self._poly_intervals:

        all_bkg_masks.append(
            np.logical_and(
                self._arrival_times >= selection.start_time,
                self._arrival_times <= selection.stop_time,
            ))

    poly_mask = all_bkg_masks[0]

    # If there are multiple masks:
    if len(all_bkg_masks) > 1:

        for mask in all_bkg_masks[1:]:

            poly_mask = np.logical_or(poly_mask, mask)

    # Select all the events in the poly selections
    # We only need to do this once

    total_poly_events = self._arrival_times[poly_mask]

    # For the channel energies we will need to down select again.
    # We can go ahead and do this to avoid repeated computations

    total_poly_energies = self._measurement[poly_mask]

    # This calculation removes the unselected portion of the light curve
    # so that we are not fitting zero counts. It will be used in the channel
    # calculations as well

    bin_width = 1.0  # seconds
    these_bins = np.arange(self._start_time, self._stop_time, bin_width)

    cnts, bins = np.histogram(total_poly_events, bins=these_bins)

    # Find the mean time of the bins and calculate the exposure in each bin

    mean_time = []
    exposure_per_bin = []

    for i in range(len(bins) - 1):

        m = np.mean((bins[i], bins[i + 1]))
        mean_time.append(m)

        exposure_per_bin.append(
            self.exposure_over_interval(bins[i], bins[i + 1]))

    mean_time = np.array(mean_time)
    exposure_per_bin = np.array(exposure_per_bin)

    # Remove bins with zero counts

    all_non_zero_mask = []

    for selection in self._poly_intervals:

        all_non_zero_mask.append(
            np.logical_and(mean_time >= selection.start_time,
                           mean_time <= selection.stop_time))

    non_zero_mask = all_non_zero_mask[0]

    if len(all_non_zero_mask) > 1:

        for mask in all_non_zero_mask[1:]:

            non_zero_mask = np.logical_or(mask, non_zero_mask)

    # Now we will find the best poly order unless the user specified one

    if self._user_poly_order == -1:

        self._optimal_polynomial_grade = (
            self._fit_global_and_determine_optimum_grade(
                cnts[non_zero_mask],
                mean_time[non_zero_mask],
                exposure_per_bin[non_zero_mask],
                bayes=bayes))

        log.info("Auto-determined polynomial order: %d" %
                 self._optimal_polynomial_grade)

    else:

        self._optimal_polynomial_grade = self._user_poly_order

    channels = list(
        range(self._first_channel, self._n_channels + self._first_channel))

    if threeML_config["parallel"]["use_parallel"]:

        def worker(channel):

            channel_mask = total_poly_energies == channel

            # Mask background events and current channel
            # poly_chan_mask = np.logical_and(poly_mask, channel_mask)
            # Select the masked events

            current_events = total_poly_events[channel_mask]

            cnts, bins = np.histogram(current_events, bins=these_bins)

            polynomial, _ = polyfit(mean_time[non_zero_mask],
                                    cnts[non_zero_mask],
                                    self._optimal_polynomial_grade,
                                    exposure_per_bin[non_zero_mask],
                                    bayes=bayes)

            return polynomial

        client = ParallelClient()

        polynomials = client.execute_with_progress_bar(
            worker, channels, name=f"Fitting {self._instrument} background")

    else:

        polynomials = []

        for channel in tqdm(channels,
                            desc=f"Fitting {self._instrument} background"):

            channel_mask = total_poly_energies == channel

            # Mask background events and current channel
            # poly_chan_mask = np.logical_and(poly_mask, channel_mask)
            # Select the masked events

            current_events = total_poly_events[channel_mask]

            # now bin the selected channel counts
            cnts, bins = np.histogram(current_events, bins=these_bins)

            # Put data to fit in an x vector and y vector
            polynomial, _ = polyfit(mean_time[non_zero_mask],
                                    cnts[non_zero_mask],
                                    self._optimal_polynomial_grade,
                                    exposure_per_bin[non_zero_mask],
                                    bayes=bayes)

            polynomials.append(polynomial)

    # We are now ready to return the polynomials
    self._polynomials = polynomials
def bayesian_blocks_not_unique(tt, ttstart, ttstop, p0):

    # Verify that the input array is one-dimensional
    tt = np.asarray(tt, dtype=float)

    assert tt.ndim == 1

    # Now create the array of unique times
    unique_t = np.unique(tt)

    t = tt
    tstart = ttstart
    tstop = ttstop

    # Create initial cell edges (Voronoi tessellation) using the unique time stamps
    edges = np.concatenate([[tstart],
                            0.5 * (unique_t[1:] + unique_t[:-1]),
                            [tstop]])

    # The last block length is 0 by definition
    block_length = tstop - edges

    if np.sum((block_length <= 0)) > 1:

        raise RuntimeError(
            "Events appear to be out of order! Check for ordering, or duplicated events."
        )

    N = unique_t.shape[0]

    # arrays to store the best configuration
    best = np.zeros(N, dtype=float)
    last = np.zeros(N, dtype=int)

    # Pre-computed priors (for speed)
    # eq. 21 from Scargle 2012
    priors = 4 - np.log(73.53 * p0 * np.power(np.arange(1, N + 1), -0.478))

    # Count how many events are in each Voronoi cell
    x, _ = np.histogram(t, edges)

    # Speed tricks: resolve once for all the functions which will be used
    # in the loop
    cumsum = np.cumsum
    log = np.log
    argmax = np.argmax
    numexpr_evaluate = numexpr.evaluate
    arange = np.arange

    # Decide the step for reporting progress
    incr = max(int(float(N) / 100.0 * 10), 1)

    logger.debug("Finding blocks...")

    # This is where the computation happens. Following Scargle et al. 2012.
    # This loop has been optimized for speed:
    # * the expression for the fitness function has been rewritten to
    #   avoid multiple log computations, and to avoid power computations
    # * the use of scipy.weave and numexpr has been evaluated. The latter
    #   gives a big gain (~40%) if used for the fitness function. No other
    #   gain is obtained by using it anywhere else

    # Set numexpr precision to low (more than enough for us), which is
    # faster than high
    oldaccuracy = numexpr.set_vml_accuracy_mode("low")
    numexpr.set_num_threads(1)
    numexpr.set_vml_num_threads(1)

    for R in tqdm(range(N)):

        br = block_length[R + 1]
        T_k = block_length[:R + 1] - br

        # N_k: number of elements in each block
        # This expression has been simplified for the case of
        # unbinned events (i.e., one element in each block)
        # It was:
        # N_k = cumsum(x[:R + 1][::-1])[::-1]
        # Now it is:
        N_k = arange(R + 1, 0, -1)

        # Evaluate fitness function
        # This is the slowest part, which I'm speeding up by using
        # numexpr. It provides a ~40% gain in execution speed.
        fit_vec = numexpr_evaluate(
            """N_k * log(N_k / T_k)""",
            optimization="aggressive",
            local_dict={"N_k": N_k, "T_k": T_k},
        )

        p = priors[R]

        A_R = fit_vec - p

        A_R[1:] += best[:R]

        i_max = argmax(A_R)

        last[R] = i_max

        best[R] = A_R[i_max]

    numexpr.set_vml_accuracy_mode(oldaccuracy)

    logger.debug("Done\n")

    # Now find blocks

    change_points = np.zeros(N, dtype=int)
    i_cp = N
    ind = N

    while True:

        i_cp -= 1

        change_points[i_cp] = ind

        if ind == 0:
            break

        ind = last[ind - 1]

    change_points = change_points[i_cp:]

    finalEdges = edges[change_points]

    return np.asarray(finalEdges)
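# Hypothetical usage sketch for the Bayesian Blocks routine above (the event
# times are simulated here purely for illustration):

import numpy as np

rng = np.random.default_rng(0)

# a constant-rate segment followed by a brighter one
tt = np.sort(np.concatenate([rng.uniform(0.0, 50.0, 200),
                             rng.uniform(50.0, 60.0, 400)]))

# edges = bayesian_blocks_not_unique(tt, ttstart=0.0, ttstop=60.0, p0=0.05)
# `edges` would contain the optimal block boundaries between ttstart and ttstop.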
def __init__(self):
    """
    Holds all the observatories/instruments/filters.
    """

    # get the filter file
    with h5py.File(get_speclite_filter_library(), "r") as f:

        self._instruments = []

        for observatory in tqdm(f.keys(),
                                desc="Loading photometric filters"):

            log.debug(f"loading {observatory}")

            sub_dict = {}

            for instrument in f[observatory].keys():

                sub_dict[instrument] = instrument

            # create a node for the observatory
            this_node = ObservatoryNode(sub_dict)

            # attach it to the object
            if observatory == "2MASS":

                xx = "TwoMass"

            else:

                xx = observatory

            setattr(self, xx, this_node)

            # now get the instruments
            for instrument in f[observatory].keys():

                # update the instruments
                self._instruments.append(instrument)

                # create the filter response via speclite
                this_grp = f[observatory][instrument]

                filters = []

                for ff in this_grp.keys():

                    grp = this_grp[ff]

                    this_filter = spec_filter.FilterResponse(
                        wavelength=grp["wavelength"][()] * u.Angstrom,
                        response=grp["transmission"][()],
                        meta=dict(
                            group_name=instrument,
                            band_name=ff,
                        ))

                    filters.append(this_filter)

                fgroup = spec_filter.FilterSequence(filters)

                # attach the filters to the observatory
                setattr(this_node, instrument, fgroup)

    self._instruments.sort()
def compute_ppc(analysis: BayesianAnalysis,
                result: BayesianResults,
                n_sims: int,
                file_name: str,
                overwrite: bool = False,
                return_ppc: bool = False) -> Union["PPC", None]:
    """
    Compute a posterior predictive check from a 3ML DispersionLike plugin.
    The resulting posterior data simulations are stored in an HDF5 file which
    can be read by the PPC class.

    :param analysis: 3ML bayesian analysis object
    :param result: 3ML analysis result
    :param n_sims: the number of posterior simulations to create
    :param file_name: the file name to save to
    :param overwrite: whether to overwrite an existing file
    :param return_ppc: if True, a PPC object will be returned directly
    :returns: None, or a PPC object if return_ppc is True
    :rtype:
    """

    update_logging_level("WARNING")

    p = Path(file_name)

    if p.exists() and (not overwrite):

        raise RuntimeError(f"{file_name} already exists!")

    with h5py.File(file_name, 'w', libver='latest') as database:

        # first we collect the real data and save it so that we will not
        # have to look it up in the future

        data_names = []

        database.attrs['n_sims'] = n_sims

        for data in analysis.data_list.values():

            data_names.append(data.name)

            grp = database.create_group(data.name)
            grp.attrs['exposure'] = data.exposure

            grp.create_dataset('ebounds',
                               data=data.response.ebounds,
                               compression='lzf')
            grp.create_dataset('obs_counts',
                               data=data.observed_counts,
                               compression='lzf')
            grp.create_dataset('bkg_counts',
                               data=data.background_counts,
                               compression='lzf')
            grp.create_dataset('mask', data=data.mask, compression='lzf')

        # select random draws from the posterior

        n_samples = len(result.samples.T)

        if n_samples < n_sims:

            print("Requested more simulations than available posterior samples; "
                  "using all available samples")

            n_sims = n_samples

        choices = np.random.choice(len(result.samples.T),
                                   replace=False,
                                   size=n_sims)

        # for each posterior sample

        with silence_console_log(and_progress_bars=False):

            for j, choice in enumerate(
                    tqdm(choices, desc="sampling posterior")):

                # get the parameters of the choice
                params = result.samples.T[choice]

                # set the analysis free parameters to the value of the posterior
                for i, (k, v) in enumerate(
                        analysis.likelihood_model.free_parameters.items()):

                    v.value = params[i]

                # create simulated data sets with these free parameters
                sim_dl = DataList(*[
                    data.get_simulated_dataset()
                    for data in analysis.data_list.values()
                ])

                # set the model of the simulated data to the model of the simulation
                for i, data in enumerate(sim_dl.values()):

                    # clone the model for safety's sake
                    # and set the model. For now we do nothing with this
                    data.set_model(clone_model(analysis.likelihood_model))

                    # store the PPC data in the file
                    grp = database[data_names[i]]

                    grp.create_dataset('ppc_counts_%d' % j,
                                       data=data.observed_counts,
                                       compression='lzf')
                    grp.create_dataset('ppc_background_counts_%d' % j,
                                       data=data.background_counts,
                                       compression='lzf')

                # sim_dls.append(sim_dl)

    if return_ppc:

        return PPC(file_name)
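# Hypothetical usage sketch for compute_ppc (the analysis objects are assumed
# to come from a completed 3ML Bayesian fit; the names below are placeholders):
#
# bayes_analysis.sample()
# ppc = compute_ppc(bayes_analysis,
#                   bayes_analysis.results,
#                   n_sims=500,
#                   file_name="my_ppc.h5",
#                   overwrite=True,
#                   return_ppc=True)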
def bin_by_significance(
    cls,
    arrival_times,
    background_getter,
    background_error_getter=None,
    sigma_level=10,
    min_counts=1,
    tstart=None,
    tstop=None,
):
    """
    Bin the data to a given significance level for a given background method and
    sigma method. If a background error function is given, then it is assumed that
    the error distribution is Gaussian. Otherwise, the error distribution is assumed
    to be Poisson.

    :param background_getter: function of a start and stop time that returns background counts
    :param background_error_getter: function of a start and stop time that returns background count errors
    :param sigma_level: the sigma level of the intervals
    :param min_counts: the minimum counts per bin
    :return:
    """

    if tstart is None:

        tstart = arrival_times.min()

    else:

        tstart = float(tstart)

    if tstop is None:

        tstop = arrival_times.max()

    else:

        tstop = float(tstop)

    starts = []
    stops = []

    # Switching to a fast search
    # Idea inspired by Damien Begue

    # these factors change the time steps
    # in the fast search. should experiment
    if sigma_level > 25:

        increase_factor = 0.5
        decrease_factor = 0.5

    else:

        increase_factor = 0.25
        decrease_factor = 0.25

    current_start = arrival_times[0]

    # first we need to see if the interval provided has enough counts
    _, counts = TemporalBinner._select_events(arrival_times, current_start,
                                              arrival_times[-1])

    # if it does not, the flag for the big loop never gets set
    end_all_search = not TemporalBinner._check_exceeds_sigma_interval(
        current_start,
        arrival_times[-1],
        counts,
        sigma_level,
        background_getter,
        background_error_getter,
    )

    # We will start the search at the mid point of the whole interval
    mid_point = 0.5 * (arrival_times[-1] + current_start)

    current_stop = mid_point

    # initialize the fast search flag
    end_fast_search = False

    # resolve once for functions used in the loop
    searchsorted = np.searchsorted

    # this is the main loop
    # as long as we have not reached the end of the interval
    # the loop will run

    if threeML_config.interface.progress_bars:
        pbar = tqdm(total=arrival_times.shape[0],
                    desc="Binning by significance")

    while not end_all_search:

        # start of the fast search
        # we reset the flag for the interval
        # having been decreased in the last pass
        decreased_interval = False

        while not end_fast_search:

            # we calculate the sigma of the current region
            _, counts = TemporalBinner._select_events(
                arrival_times, current_start, current_stop)

            sigma_exceeded = TemporalBinner._check_exceeds_sigma_interval(
                current_start,
                current_stop,
                counts,
                sigma_level,
                background_getter,
                background_error_getter,
            )

            time_step = abs(current_stop - current_start)

            # if we do not exceed the sigma
            # we need to increase the time interval
            if not sigma_exceeded:

                # however, if in the last pass we had to decrease
                # the interval, it means we have found where we
                # need to start the slow search
                if decreased_interval:

                    # mark where we are in the list
                    start_idx = searchsorted(arrival_times, current_stop)

                    # end the fast search
                    end_fast_search = True

                # otherwise we increase the interval
                else:

                    # unless we would increase it too far
                    if (current_stop +
                            time_step * increase_factor) >= arrival_times[-1]:

                        # mark where we are in the interval
                        start_idx = searchsorted(arrival_times, current_stop)

                        # then we also want to go ahead and get out of the fast search
                        end_fast_search = True

                    else:

                        # increase the interval
                        current_stop += time_step * increase_factor

            # if we did exceed the sigma level we will need to step
            # back in time to find where it was NOT exceeded
            else:

                # decrease the interval
                current_stop -= time_step * decrease_factor

                # inform the loop that we have been back stepping
                decreased_interval = True

        # Now we are ready for the slow forward search
        # where we count up all the photons

        # we have already counted up the photons to this point
        total_counts = counts

        # start searching from where the fast search ended
        if threeML_config.interface.progress_bars:
            pbar.update(counts)

        for time in arrival_times[start_idx:]:

            total_counts += 1

            if threeML_config.interface.progress_bars:
                pbar.update(1)

            if total_counts < min_counts:

                continue

            else:

                # first use the background function to know the number of background counts
                bkg = background_getter(current_start, time)

                sig = Significance(total_counts, bkg)

                if background_error_getter is not None:

                    bkg_error = background_error_getter(current_start, time)

                    sigma = sig.li_and_ma_equivalent_for_gaussian_background(
                        bkg_error)[0]

                else:

                    sigma = sig.li_and_ma()[0]

                # now test if we have enough sigma
                if sigma >= sigma_level:

                    # if we succeeded we want to mark the time bins
                    stops.append(time)
                    starts.append(current_start)

                    # set up the next fast search
                    # by looking past this interval
                    current_start = time
                    current_stop = 0.5 * (arrival_times[-1] + time)

                    end_fast_search = False

                    # get out of the for loop
                    break

        # if we never exceeded the sigma level by the
        # end of the search, we never will
        if end_fast_search:

            # so let's kill the main search
            end_all_search = True

    if not starts:

        log.error(
            "The requested sigma level could not be achieved in the interval. "
            "Try decreasing it.")

    else:

        return cls.from_starts_and_stops(starts, stops)
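# Hypothetical usage sketch for the classmethod above (a flat background rate
# is assumed just for illustration; `TemporalBinner` and `arrival_times` come
# from the surrounding module):
#
# bins = TemporalBinner.bin_by_significance(
#     arrival_times,
#     background_getter=lambda t1, t2: background_rate * (t2 - t1),
#     sigma_level=5,
#     min_counts=10,
# )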
def download(
    self,
    remote_filename,
    destination_path: str,
    new_filename=None,
    progress=True,
    compress=False,
):

    assert remote_filename in self.files, \
        "File %s is not contained in this directory (%s)" % (
            remote_filename,
            self._request_result.url,
        )

    destination_path: Path = sanitize_filename(destination_path,
                                               abspath=True)

    assert path_exists_and_is_directory(destination_path), (
        f"Provided destination {destination_path} does not exist or "
        "is not a directory")

    # If no filename is specified, use the same name that the file has on the remote server
    if new_filename is None:
        new_filename: str = remote_filename.split("/")[-1]

    # Get the fully qualified path for the remote and the local file
    remote_path: str = self._request_result.url + remote_filename
    local_path: Path = destination_path / new_filename

    # Ask the server for the file, but do not download it just yet
    # (stream=True will get the HTTP header but nothing else)
    # Use stream=True for two reasons:
    # * so that the file is not downloaded all in memory before being written to the disk
    # * so that we can report progress if requested
    this_request = requests.get(remote_path, stream=True)

    # Figure out the size of the file
    file_size = int(this_request.headers["Content-Length"])

    log.debug(f"downloading {remote_filename} of size {file_size}")

    # Now check if we really need to download this file
    if compress:

        # Add a .gz at the end of the file path
        log.debug(
            f"file {remote_filename} will be downloaded and compressed")

        local_path: Path = Path(f"{local_path}.gz")

    if file_existing_and_readable(local_path):

        local_size = os.path.getsize(local_path)

        if local_size == file_size or compress:

            # if the compressed file already exists
            # it will have a smaller size

            # No need to download it again
            log.info(f"file {remote_filename} is already downloaded!")

            return local_path

    if local_path.is_file():

        first_byte = os.path.getsize(local_path)

    else:

        first_byte = 0

    # Chunk size shouldn't be too small, otherwise we create a bottleneck in the download speed
    chunk_size = 1024 * 10

    # If the user wants to compress the file, use gzip, otherwise the normal opener
    if compress:

        import gzip

        opener = gzip.open

    else:

        opener = open

    if threeML_config["interface"]["progress_bars"]:

        # Set a title for the progress bar
        bar_title = "Downloading %s" % new_filename

        total_size = int(this_request.headers.get('content-length', 0))

        bar = tqdm(
            initial=first_byte,
            unit_scale=True,
            unit_divisor=1024,
            unit="B",
            total=total_size,
            desc=bar_title,
        )

        with opener(local_path, "wb") as f:

            for chunk in this_request.iter_content(chunk_size=chunk_size):

                if chunk:  # filter out keep-alive new chunks

                    f.write(chunk)
                    bar.update(len(chunk))

        this_request.close()
        bar.close()

    else:

        with opener(local_path, "wb") as f:

            for chunk in this_request.iter_content(chunk_size=chunk_size):

                if chunk:  # filter out keep-alive new chunks

                    f.write(chunk)

        this_request.close()

    return local_path
def download_files_from_directory_ftp(ftp_url,
                                      destination_directory,
                                      filenames=None,
                                      namefilter=None):

    # Parse url
    tokens = urllib.parse.urlparse(ftp_url)
    serverAddress = tokens.netloc
    directory = tokens.path

    # if no filename has been specified, connect first to retrieve the list of files to download
    if filenames is None:

        # Connect to server and log in
        ftp = ftplib.FTP(serverAddress, "anonymous", "", "", timeout=60)

        try:

            ftp.login()

        except:

            # Maybe we are already logged in
            try:

                ftp.cwd("/")

            except:

                # nope! don't know what is happening
                raise

        # Move to origin directory
        ftp.cwd(directory)

        # Retrieve list of files
        filenames = []
        ftp.retrlines("NLST", filenames.append)

        # Close connection (will reopen later)
        ftp.close()

    # Download files with progress report
    downloaded_files = []

    for i, filename in enumerate(tqdm(filenames)):

        if namefilter is not None and filename.find(namefilter) < 0:

            # Filename does not match, do not download it
            continue

        else:

            local_filename = os.path.join(destination_directory, filename)

            urllib.request.urlretrieve(
                "ftp://%s/%s/%s" % (serverAddress, directory, filename),
                local_filename,
            )

            urllib.request.urlcleanup()

            downloaded_files.append(local_filename)

    return downloaded_files
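# Hypothetical usage sketch (the FTP URL and paths below are placeholders,
# not endpoints from the source):
#
# files = download_files_from_directory_ftp(
#     "ftp://example.com/some/remote/directory/",
#     destination_directory="/tmp/downloads",
#     namefilter=".fit",
# )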