def profile_main():
    qso_record_table = table.Table(np.load(settings.get_qso_metadata_npy()))
    flag_stats = FlagStats()

    # assume qso_record_table is already sorted
    spec_sample = read_spectrum_fits.enum_spectra(qso_record_table,
                                                  pre_sort=False,
                                                  flag_stats=flag_stats)

    qso_spectra_hdf5 = settings.get_qso_spectra_hdf5()
    output_spectra = Hdf5SpectrumContainer(qso_spectra_hdf5, readonly=False,
                                           create_new=True,
                                           num_spectra=MAX_SPECTRA)

    if settings.get_single_process():
        result_enum = map(save_spectrum, spec_sample)
    else:
        assert False, "Not supported"

    for i in result_enum:
        index = i[0]
        output_spectra.set_wavelength(index, i[1])
        output_spectra.set_flux(index, i[2])
        output_spectra.set_ivar(index, i[3])

    for bit in range(0, 32):
        print(flag_stats.to_string(bit))
    print('Total count: ' + str(flag_stats.pixel_count))
def remove_median(delta_t, ar_delta_t_median, ar_ivar_total, ar_z):
    """
    Remove the median of the delta transmittance per redshift bin.
    The change is made in-place.
    :param delta_t: spectrum container holding the delta transmittance
    :param ar_delta_t_median: median delta transmittance per redshift bin
    :param ar_ivar_total: total weight (inverse variance) per redshift bin
    :param ar_z: redshift bins
    """
    # remove nan values (redshift bins with a total weight of 0)
    mask = ar_ivar_total != 0
    # keep only the valid median values per redshift bin.
    ar_median_no_nan = ar_delta_t_median[mask]
    ar_z_no_nan = ar_z[mask]
    empty_array = np.array([])
    n = 0
    # remove the median (in-place)
    for i in range(delta_t.num_spectra):
        ar_wavelength = delta_t.get_wavelength(i)
        ar_flux = delta_t.get_flux(i)
        ar_ivar = delta_t.get_ivar(i)
        if ar_wavelength.size:
            ar_delta_t_correction = np.interp(ar_wavelength, ar_z_no_nan,
                                              ar_median_no_nan, 0, 0)
            delta_t.set_wavelength(i, ar_wavelength)
            delta_t.set_flux(i, ar_flux - ar_delta_t_correction)
            delta_t.set_ivar(i, ar_ivar)
            n += 1
        else:
            delta_t.set_wavelength(i, empty_array)
            delta_t.set_flux(i, empty_array)
            delta_t.set_ivar(i, empty_array)
def get_weighted_median(self, weighted=True):
    # the 'weighted' argument shadows the weighted module, hence the alias.
    ar_median_weights = self.ar_flux_bins if weighted else self.ar_unweighted_flux_bins
    res = np.zeros(self.ar_z.size)
    for n in range(self.ar_z.size):
        res[n] = weighted_module.median(np.arange(self.flux_res),
                                        ar_median_weights[n])
    # rescale bin indices back to flux units
    return res / self.flux_res * self.flux_range + self.flux_offset
def profile_main():
    galaxy_metadata_file_npy = settings.get_galaxy_metadata_npy()
    histogram_output_npz = settings.get_ism_histogram_npz()
    galaxy_record_table = table.Table(np.load(galaxy_metadata_file_npy))
    num_extinction_bins = settings.get_num_extinction_bins()
    extinction_field_name = settings.get_extinction_source()
    ism_object_classes = settings.get_ism_object_classes()

    galaxy_table_mask = np.array(
        [i in ism_object_classes for i in galaxy_record_table['class']])
    galaxy_record_table = galaxy_record_table[galaxy_table_mask]

    # group results into extinction bins with roughly equal numbers of spectra.
    galaxy_record_table.sort([extinction_field_name])
    # remove objects with unknown extinction
    galaxy_record_table = galaxy_record_table[np.where(
        np.isfinite(galaxy_record_table[extinction_field_name]))]

    chunk_sizes, chunk_offsets = get_chunks(len(galaxy_record_table),
                                            num_extinction_bins)
    for i in range(num_extinction_bins):
        extinction_bin_start = chunk_offsets[i]
        extinction_bin_end = extinction_bin_start + chunk_sizes[i]
        extinction_bin_record_table = galaxy_record_table[
            extinction_bin_start:extinction_bin_end]

        # collect group statistics; this should be done before the plate sort.
        group_parameters = {
            'extinction_bin_number': i,
            'extinction_minimum':
                extinction_bin_record_table[extinction_field_name][0],
            'extinction_maximum':
                extinction_bin_record_table[extinction_field_name][-1],
            'extinction_average':
                np.mean(extinction_bin_record_table[extinction_field_name]),
            'extinction_median':
                np.median(extinction_bin_record_table[extinction_field_name]),
        }

        # sort by plate to avoid constant switching of fits files (which are per plate).
        extinction_bin_record_table.sort(['plate', 'mjd', 'fiberID'])

        base_filename, file_extension = splitext(histogram_output_npz)
        histogram_output_filename = '{}_{:02d}{}'.format(
            base_filename, i, file_extension)

        r_print('Starting extinction bin {}'.format(i))
        calc_median_spectrum(extinction_bin_record_table,
                             histogram_output_filename,
                             group_parameters=group_parameters)
        r_print('Finished extinction bin {}'.format(i))
def profile_main():
    t_ = create_qso_table()
    fill_qso_table(t_)
    t_.sort(['plate', 'mjd', 'fiberID'])

    # add indices after sort
    t_['index'] = np.arange(len(t_))

    np.save(settings.get_qso_metadata_npy(), t_)
def int_to_string(cls, flags):
    bit_string = ''
    for i in range(32):
        if flags & 1:
            if bit_string:
                bit_string += '|'
            bit_string += cls.FlagNames[i]
        flags >>= 1
    return bit_string
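# Usage sketch (hypothetical flag names): if, say, FlagNames[0] == 'NOPLUG'
# and FlagNames[2] == 'BADFLAT', a call on the containing class would decode
# a pixel-mask integer as:
#   >>> FlagStats.int_to_string(0b101)
#   'NOPLUG|BADFLAT'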
def rolling_weighted_median(ar_data, ar_weights, box_size):
    ar_flux_smoothed = np.zeros_like(ar_data)
    # the window extends box_size//2 below each pixel, and box_size//2
    # (plus 1 if box_size is odd) above it.
    box_size_lower = -(box_size // 2)
    box_size_upper = box_size // 2 + (box_size & 1)
    for j in range(ar_data.size):
        start = max(j + box_size_lower, 0)
        end = min(j + box_size_upper, ar_data.size)
        ar_flux_smoothed[j] = weighted.median(ar_data[start:end],
                                              ar_weights[start:end])
    return ar_flux_smoothed
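# Usage sketch: a 3-pixel weighted rolling median; the weighted median
# down-weights low-weight samples within each window, so the outlier at
# index 1 is largely suppressed.
#   >>> ar_data = np.array([1., 9., 1., 1.])
#   >>> ar_weights = np.array([1., 0.01, 1., 1.])
#   >>> smoothed = rolling_weighted_median(ar_data, ar_weights, box_size=3)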
def l_print(*args):
    """
    print a message on each node, synchronized
    """
    for rank in range(0, comm.size):
        comm.Barrier()
        if rank == comm.rank:
            l_print_no_barrier(*args)
    comm.Barrier()
def add_flux_pre_binned(self, ar_flux, ar_mask, ar_weights):
    for n in range(self.ar_z.size):
        if ar_mask[n]:
            ar_effective_weight = ar_weights[n]
            ar_effective_flux = np.asarray(ar_flux[n])
            # normalize flux: the inverse of the mapping used in
            # get_weighted_median (res / flux_res * flux_range + flux_offset).
            ar_normalized_flux = (ar_effective_flux - self.flux_offset) / self.flux_range
            ar_flux_index_float = np.clip(ar_normalized_flux, self.flux_min,
                                          self.flux_max) * self.flux_res
            ar_flux_index = np.clip(ar_flux_index_float.astype(int), 0,
                                    self.flux_res - 1)
            self.ar_flux_bins[n, ar_flux_index] += ar_effective_weight
            self.ar_unweighted_flux_bins[n, ar_flux_index] += 1
def get_bundles(start, end, size):
    """
    split a range into bundles. each bundle is a tuple with an offset and a size.
    the last bundle is truncated so the bundles exactly cover [start, end).
    assumes end > start.
    :type start: int
    :type end: int
    :type size: int
    :rtype: zip of (int, int) tuples
    """
    offsets = range(start, end, size)
    sizes = [size] * len(offsets)
    sizes[-1] = end - offsets[-1]
    return zip(offsets, sizes)
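# Usage sketch:
#   >>> list(get_bundles(0, 10, 4))
#   [(0, 4), (4, 4), (8, 2)]
# the final bundle is truncated so that the bundles exactly cover the range.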
def nu_boxcar(x, y, x_left_func, x_right_func, weights=None):
    y_boxcar = np.zeros_like(y)
    if weights is None:
        weights = np.ones_like(x)
    for n in range(x.size):
        x_left = np.searchsorted(x, x_left_func(x[n]))
        x_right = np.searchsorted(x, x_right_func(x[n]))
        box_weights = weights[x_left:x_right]
        if box_weights.sum() > 0:
            y_boxcar[n] = np.average(y[x_left:x_right], weights=box_weights)
        else:
            # no valid pixels in the window; fall back to the original value
            y_boxcar[n] = y[n]
    return y_boxcar
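# Usage sketch, mirroring the detrending call in delta_transmittance_chunk:
# a boxcar that averages all pixels within +/-300 (comoving distance units)
# of each point.
#   >>> x = np.linspace(0., 2000., 500)
#   >>> y = np.sin(x / 200.) + 0.1
#   >>> trend = nu_boxcar(x, y, lambda c: c - 300, lambda c: c + 300)
#   >>> detrended = y - trend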
def calc_fit_power_law(delta_f_snr_bins=snr_stats_total):
    snr_bins = delta_f_snr_bins_helper.get_log_snr_axis()
    y_quantile = np.zeros_like(snr_bins)
    y1 = delta_f_snr_bins_helper.get_delta_f_axis()
    for i in range(snr_bins.size):
        y_quantile[i] = weighted.quantile(y1, delta_f_snr_bins[i], .9)
    mask = np.logical_and(0 < snr_bins, snr_bins < 3)
    masked_snr_bins = snr_bins[mask]
    fit_params = lmfit.Parameters()
    fit_params.add('a', -2., min=-5, max=-1)
    fit_params.add('b', 1., min=0.1, max=20.)
    fit_params.add('c', 0.08, min=0, max=0.2)
    fit_params.add('d', 3, min=-5, max=5)
    fit_result = lmfit.minimize(fit_function, fit_params,
                                kws={'data': y_quantile[mask],
                                     'x': masked_snr_bins})
    return fit_result, snr_bins, masked_snr_bins, y_quantile
def accumulate(self, result_enum, ar_qso_indices_list, object_all_results):
    for ar_continua, ar_qso_indices, object_result in zip(
            result_enum, ar_qso_indices_list, object_all_results):
        continua = ContinuumFitContainer.from_np_array_and_object(
            ar_continua, object_result)
        # array-based mpi gather returns zeros at the end of the global array.
        # use the fact that the object-based gather returns the correct number of elements:
        num_spectra = len(object_result)
        for n in range(num_spectra):
            index = ar_qso_indices[n]
            self.continuum_fit_container.set_wavelength(
                index, continua.get_wavelength(n))
            self.continuum_fit_container.set_flux(index, continua.get_flux(n))
            # TODO: refactor
            self.continuum_fit_container.copy_metadata(
                index, continua.get_metadata(n))
            self.n += 1
        l_print_no_barrier("n =", self.n)
def reduce_and_save(output_file, global_histogram, histogram, group_parameters):
    comm.Reduce([histogram, MPI.DOUBLE], [global_histogram, MPI.DOUBLE],
                op=MPI.SUM, root=0)
    if comm.rank == 0:
        # compute the median and add it to the npz file
        ism_spec = np.zeros(shape=global_histogram.shape[1], dtype=np.double)
        for i in range(ism_spec.size):
            ism_spec[i] = weighted.quantile(
                np.arange(global_histogram.shape[0]),
                global_histogram[:, i], 0.5)
        # rescale bin indices back to flux units
        ism_spec *= float(flux_range) / num_bins
        ism_spec += flux_min

        np.savez_compressed(output_file,
                            histogram=global_histogram,
                            ar_wavelength=ar_wavelength,
                            flux_range=[flux_min, flux_max],
                            ism_spec=ism_spec,
                            group_parameters=group_parameters)
def get_mask_list(self, plate, mjd, fiber_id):
    qso_tuple = (plate, mjd, fiber_id)
    mask_list = []
    z_vi = None
    # if the QSO is not in the BAL list, return an empty list
    if qso_tuple in self.bal_dict:
        i = self.bal_dict[qso_tuple]
        d = self.data
        z_vi = d.Z_VI[i]
        for j in range(d.NCIV_450[i]):
            for line_center in self.line_centers.values():
                # note that 'start' corresponds to VMAX, and 'end' to VMIN.
                # add a safety margin
                margin = 0.002
                end = civ_rel_velocity_to_wavelength(
                    line_center, z_vi, d.VMIN_CIV_450[i][j]) * (1 + margin)
                start = civ_rel_velocity_to_wavelength(
                    line_center, z_vi, d.VMAX_CIV_450[i][j]) * (1 - margin)
                mask_list += [MaskElement(start, end)]
    return mask_list, z_vi
def calc_fit_power_law(delta_f_snr_bins=snr_stats_total):
    snr_bins = delta_f_snr_bins_helper.get_log_snr_axis()
    y_quantile = np.zeros_like(snr_bins)
    y1 = delta_f_snr_bins_helper.get_delta_f_axis()
    for i in range(snr_bins.size):
        y_quantile[i] = weighted.quantile(y1, delta_f_snr_bins[i], .9)
    mask = np.logical_and(0 < snr_bins, snr_bins < 3)
    masked_snr_bins = snr_bins[mask]
    fit_params = lmfit.Parameters()
    fit_params.add('a', -2., min=-5, max=-1)
    fit_params.add('b', 1., min=0.1, max=20.)
    fit_params.add('c', 0.08, min=0, max=0.2)
    # make sure the exponent base is non-negative
    fit_params.add('d', 3, min=-masked_snr_bins.min(), max=5)
    fit_result = lmfit.minimize(fit_function, fit_params,
                                kws={'data': y_quantile[mask],
                                     'x': masked_snr_bins})
    return fit_result, snr_bins, masked_snr_bins, y_quantile
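# 'fit_function' is defined elsewhere in this module. A minimal sketch that is
# consistent with the parameter bounds above (a negative exponent 'a', and 'd'
# bounded so that the base x + d stays non-negative); this is an assumption,
# not the confirmed original:
#
#   def fit_function(params, data, x):
#       p = params.valuesdict()
#       model = p['b'] * (x + p['d']) ** p['a'] + p['c']
#       # lmfit.minimize expects the residual vector
#       return model - data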
def mean_transmittance_chunk(qso_record_table):
    start_offset = qso_record_table[0]['index']
    spectra = read_spectrum_hdf5.SpectraWithMetadata(
        qso_record_table, settings.get_qso_spectra_hdf5())
    continuum_fit_file = ContinuumFitContainerFiles(False)

    m = mean_transmittance.MeanTransmittance(np.arange(*z_range))
    med = median_transmittance.MedianTransmittance(np.arange(*z_range))
    for n in range(len(qso_record_table)):
        qso_spec_obj = spectra.return_spectrum(n)
        index = qso_spec_obj.qso_rec.index

        ar_fit_spectrum = continuum_fit_file.get_flux(index)
        if not continuum_fit_file.get_is_good_fit(index):
            local_mean_stats['bad_fit'] += 1
            l_print_no_barrier("skipped QSO (bad fit): ", qso_spec_obj.qso_rec)
            continue

        lya_forest_transmittance_binned = qso_transmittance_binned(
            qso_spec_obj, ar_fit_spectrum, local_mean_stats)
        if lya_forest_transmittance_binned.ar_transmittance.size:
            # save the mean and/or median according to the common settings:
            if settings.get_enable_weighted_mean_estimator():
                m.add_flux_pre_binned(
                    lya_forest_transmittance_binned.ar_transmittance,
                    lya_forest_transmittance_binned.ar_mask,
                    lya_forest_transmittance_binned.ar_ivar)
            if settings.get_enable_weighted_median_estimator():
                med.add_flux_pre_binned(
                    lya_forest_transmittance_binned.ar_transmittance,
                    lya_forest_transmittance_binned.ar_mask,
                    lya_forest_transmittance_binned.ar_ivar)
            mean_transmittance_chunk.num_spec += 1

    l_print_no_barrier("finished chunk, num spectra:",
                       mean_transmittance_chunk.num_spec,
                       " offset: ", start_offset)
    return np.vstack((m.as_np_array(), med.as_np_array())), None
def update_mean(delta_t_file):
    n = 0
    ar_z = np.arange(1.9, 3.5, 0.0005)

    # weighted mean
    ar_delta_t_sum = np.zeros_like(ar_z)
    ar_delta_t_count = np.zeros_like(ar_z)
    ar_delta_t_weighted = np.zeros_like(ar_z)

    # histogram median
    delta_t_min, delta_t_max = (-10, 10)
    delta_t_num_buckets = 1000
    ar_delta_t_histogram = np.zeros(shape=(ar_z.size, delta_t_num_buckets))

    ar_ivar_total = np.zeros_like(ar_z)

    # calculate the weighted sum of the delta transmittance per redshift bin.
    for i in range(delta_t_file.num_spectra):
        ar_z_unbinned = delta_t_file.get_wavelength(i)
        ar_delta_t_unbinned = delta_t_file.get_flux(i)
        ar_ivar_unbinned = delta_t_file.get_ivar(i)
        if ar_z_unbinned.size > 2:
            f_delta_t = interpolate.interp1d(ar_z_unbinned, ar_delta_t_unbinned,
                                             kind='nearest', bounds_error=False,
                                             fill_value=0, assume_sorted=True)
            ar_delta_t = f_delta_t(ar_z)
            f_ivar = interpolate.interp1d(ar_z_unbinned, ar_ivar_unbinned,
                                          kind='nearest', bounds_error=False,
                                          fill_value=0, assume_sorted=True)
            ar_ivar = f_ivar(ar_z)

            ar_delta_t_sum += ar_delta_t
            ar_delta_t_weighted += ar_delta_t * ar_ivar
            ar_delta_t_count += ar_delta_t != 0
            ar_ivar_total += ar_ivar

            ar_delta_t_clipped = np.clip(ar_delta_t, delta_t_min, delta_t_max)
            ar_delta_t_buckets = rescale(ar_delta_t_clipped,
                                         (delta_t_min, delta_t_max),
                                         (0, delta_t_num_buckets))
            ar_delta_t_buckets = np.clip(ar_delta_t_buckets.astype(np.int32),
                                         0, delta_t_num_buckets - 1)
            for j in range(ar_z.size):
                ar_delta_t_histogram[j, ar_delta_t_buckets[j]] += ar_ivar[j]
            n += 1

    # save intermediate result (the mean delta_t before removal)
    np.save(settings.get_mean_delta_t_npy(),
            np.vstack((ar_z, ar_delta_t_weighted, ar_ivar_total,
                       ar_delta_t_sum, ar_delta_t_count)))

    ar_delta_t_median = np.zeros_like(ar_z)
    for i in range(ar_z.size):
        ar_delta_t_median[i] = weighted.median(np.arange(delta_t_num_buckets),
                                               ar_delta_t_histogram[i])
    # rescale bucket indices back to delta_t units
    ar_delta_t_median = rescale(ar_delta_t_median, (0, delta_t_num_buckets),
                                (delta_t_min, delta_t_max))
    np.save(settings.get_median_delta_t_npy(),
            np.vstack((ar_z, ar_delta_t_median)))

    return ar_delta_t_weighted, ar_ivar_total, ar_z, n, ar_delta_t_median
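# Usage sketch: update_mean feeds remove_median (hypothetical wiring, matching
# the return signature above and the remove_median signature defined earlier):
#
#   ar_delta_t_weighted, ar_ivar_total, ar_z, n, ar_delta_t_median = update_mean(delta_t_file)
#   remove_median(delta_t_file, ar_delta_t_median, ar_ivar_total, ar_z)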
def do_continuum_fit_chunk(qso_record_table):
    start_offset = qso_record_table[0]['index']
    spectra = read_spectrum_hdf5.SpectraWithMetadata(
        qso_record_table, settings.get_qso_spectra_hdf5())
    num_spectra = len(qso_record_table)
    continuum_chunk = ContinuumFitContainer(num_spectra)

    # DISABLED FOR NOW
    # use_existing_mean_transmittance = os.path.exists(settings.get_median_transmittance_npy()) and os.path.exists(
    #     settings.get_mean_delta_t_npy())
    use_existing_mean_transmittance = False

    median_flux_correction_func = None
    if use_existing_mean_transmittance:
        # m = mean_transmittance.MeanTransmittance.from_file(settings.get_mean_transmittance_npy())
        med = median_transmittance.MedianTransmittance.from_file(
            settings.get_median_transmittance_npy())
        # for debugging with a small data set:
        # ignore values with less than 20 sample points
        # ar_z_mean_flux, ar_mean_flux = m.get_weighted_mean_with_minimum_count(20)
        ar_z_mean_flux, ar_mean_flux = med.get_weighted_median_with_minimum_count(20)

        def median_flux_func(ar_z):
            return np.interp(ar_z, ar_z_mean_flux, ar_mean_flux)

        ar_z_mean_correction, ar_mean_correction = get_weighted_mean_from_file()

        def median_flux_correction_func(ar_z):
            return median_flux_func(ar_z) * (
                1 - np.interp(ar_z, ar_z_mean_correction, ar_mean_correction))

    for n in range(len(qso_record_table)):
        current_qso_data = spectra.return_spectrum(n)

        pre_processed_qso_data, result_string = pre_process_spectrum.apply(
            current_qso_data)
        if result_string != 'processed':
            # error during pre-processing. log statistics of error causes.
            local_stats[result_string] += 1
            continue

        ar_wavelength = pre_processed_qso_data.ar_wavelength
        ar_flux = pre_processed_qso_data.ar_flux
        ar_ivar = pre_processed_qso_data.ar_ivar
        qso_rec = pre_processed_qso_data.qso_rec
        # set z after pre-processing, because BAL QSOs have a visually inspected redshift.
        z = qso_rec.z
        assert ar_flux.size == ar_ivar.size

        if not ar_ivar.sum() > 0 or not np.any(np.isfinite(ar_flux)):
            # no useful data
            local_stats['empty'] += 1
            continue

        fit_result = fit_pca.fit(
            ar_wavelength / (1 + z), ar_flux, ar_ivar, z,
            boundary_value=np.nan,
            mean_flux_constraint_func=median_flux_correction_func)

        if not fit_result.is_good_fit:
            local_stats['bad_fit'] += 1
            l_print_no_barrier("bad fit QSO: ", qso_rec)

        continuum_chunk.set_wavelength(n, ar_wavelength)
        continuum_chunk.set_flux(n, fit_result.spectrum)
        # TODO: find a way to estimate the error, or create a file without ivar values.

        continuum_chunk.set_metadata(n, fit_result.is_good_fit,
                                     fit_result.goodness_of_fit,
                                     fit_result.snr)

        local_stats['accepted'] += 1

    l_print_no_barrier("offset =", start_offset)
    return continuum_chunk.as_np_array(), continuum_chunk.as_object()
ar_map_0 = None
ar_map_unc_0 = None
ar_map_0_log = None
if comm.rank == 0:
    ar_map_0 = hp.fitsfunc.read_map(
        "../../data/COM_CompMap_Dust-DL07-AvMaps_2048_R2.00.fits", field=0)
    ar_map_unc_0 = hp.fitsfunc.read_map(
        "../../data/COM_CompMap_Dust-DL07-AvMaps_2048_R2.00.fits", field=1)
    # ar_map_0_log = np.log(ar_map_0)
    np.clip(ar_map_unc_0, 1e-2, np.inf, ar_map_unc_0)

    # optionally add a mock signal to the map
    mock = False
    if mock:
        # assumption: take the map resolution from the map itself
        ar_map_nside = hp.get_nside(ar_map_0)
        ar_mock = ar_map_0
        nside_signal = 32
        radius = hp.nside2resol(nside_signal) / 2 / np.sqrt(2)
        for i in range(hp.nside2npix(nside_signal)):
            vec1 = hp.pix2vec(nside_signal, i)
            mask = hp.query_disc(ar_map_nside, vec=vec1, radius=radius)
            ar_mock[mask] *= 100
        ar_mock /= np.sqrt(100)

# send the map to all other nodes
ar_map_local = comm.bcast(ar_map_0)
ar_map_unc_local = comm.bcast(ar_map_unc_0)

# initialize correlation bins
num_bins = 100
ar_product_total = np.zeros(shape=(10, num_bins))
ar_weights_total = np.zeros(shape=(10, num_bins))
ar_counts_total = np.zeros(shape=(10, num_bins))
def delta_transmittance_chunk(qso_record_table):
    start_offset = qso_record_table[0]['index']
    spectra = read_spectrum_hdf5.SpectraWithMetadata(
        qso_record_table, settings.get_qso_spectra_hdf5())
    continuum_fit_file = ContinuumFitContainerFiles(False)

    num_spectra = len(qso_record_table)
    delta_t = NpSpectrumContainer(False, num_spectra=num_spectra)
    # warning: np.ndarray is not initialized by default. zeroing manually.
    delta_t.zero()
    m = mean_transmittance.MeanTransmittance.from_file(
        settings.get_mean_transmittance_npy())
    # m = median_transmittance.MedianTransmittance.from_file(settings.get_median_transmittance_npy())
    # for debugging with a small data set:
    # ignore values with less than 20 sample points
    ar_z_mean_transmittance, ar_mean_transmittance = m.get_weighted_mean_with_minimum_count(20)
    # ar_z_mean_transmittance, ar_mean_transmittance = m.get_weighted_median_with_minimum_count(20, weighted=True)
    remove_dla = RemoveDlaSimple()

    pixel_weight = pixel_weight_coefficients.PixelWeight(
        pixel_weight_coefficients.DEFAULT_WEIGHT_Z_RANGE)
    for n in range(len(qso_record_table)):
        qso_spec_obj = spectra.return_spectrum(n)
        index = qso_spec_obj.qso_rec.index

        if not continuum_fit_file.get_is_good_fit(index):
            local_delta_stats['bad_fit'] += 1
            l_print_no_barrier("skipped QSO (bad fit): ", qso_spec_obj.qso_rec)
            continue

        ar_fit_spectrum = continuum_fit_file.get_flux(index)
        # we assume the fit spectrum uses the same wavelengths.

        lya_forest_transmittance = qso_transmittance(
            qso_spec_obj, ar_fit_spectrum, local_delta_stats,
            downsample_factor=settings.get_forest_downsample_factor())
        ar_z = lya_forest_transmittance.ar_z
        if ar_z.size:
            # prepare the mean transmittance for the z range of this QSO
            ar_mean_flux_for_z_range = np.asarray(
                np.interp(ar_z, ar_z_mean_transmittance, ar_mean_transmittance))

            # delta transmittance is the change in relative transmittance vs the mean;
            # therefore, subtract 1.
            ar_delta_t = lya_forest_transmittance.ar_transmittance / ar_mean_flux_for_z_range - 1

            # finish the error estimation, and save it
            ar_delta_t_ivar = pixel_weight.eval(
                lya_forest_transmittance.ar_ivar,
                ar_mean_flux_for_z_range * lya_forest_transmittance.ar_fit,
                ar_z)

            # simple DLA removal (without using a catalog)
            if settings.get_enable_simple_dla_removal():
                # remove DLA regions by setting the ivar of nearby pixels to 0
                ar_dla_mask = remove_dla.get_mask(ar_delta_t)
                if np.any(ar_dla_mask):
                    l_print_no_barrier("DLA(s) removed from QSO: ",
                                       qso_spec_obj.qso_rec)
                ar_delta_t_ivar[ar_dla_mask] = 0

            # ignore nan or infinite values (in case m_mean has incomplete data because of a low sample size)
            # Note: the wavelength field is used to store redshift
            finite_mask = np.logical_and(np.isfinite(ar_delta_t),
                                         np.isfinite(ar_delta_t_ivar))
            finite_z = ar_z[finite_mask]
            finite_delta_t = ar_delta_t[finite_mask]
            finite_ivar = ar_delta_t_ivar[finite_mask]

            # detrend forests with a large enough range in comoving coordinates:
            finite_distances = cd.fast_comoving_distance(finite_z)
            if finite_distances[-1] - finite_distances[0] > 500:
                delta_t_boxcar = nu_boxcar(finite_distances, finite_delta_t,
                                           lambda c: c - 300,
                                           lambda c: c + 300,
                                           weights=finite_ivar)
                finite_delta_t = finite_delta_t - delta_t_boxcar

            delta_t.set_wavelength(n, finite_z)
            delta_t.set_flux(n, finite_delta_t)
            delta_t.set_ivar(n, finite_ivar)
        else:
            # empty record
            pass
        delta_transmittance_chunk.num_spec += 1

    l_print_no_barrier("finished chunk, num spectra:",
                       delta_transmittance_chunk.num_spec,
                       " offset: ", start_offset)
    return delta_t.as_np_array(), None
def fit_binned(self, pca, ar_flux_rebinned, ar_ivar_rebinned,
               ar_mean_flux_constraint, qso_redshift):
    is_good_fit = True
    ar_red_flux_rebinned = ar_flux_rebinned[pca.LY_A_PEAK_INDEX:]
    ar_red_ivar_rebinned = ar_ivar_rebinned[pca.LY_A_PEAK_INDEX:]

    # Suzuki 2004 normalizes flux according to the 21 pixels around 1280
    normalization_factor = ar_red_flux_rebinned[
        pca.LY_A_NORMALIZATION_INDEX - 10:pca.LY_A_NORMALIZATION_INDEX + 11].mean()
    ar_red_flux_rebinned_normalized = ar_red_flux_rebinned / float(normalization_factor)

    ar_full_fit = None
    if not np.any(ar_red_ivar_rebinned) or not np.any(np.isfinite(ar_red_ivar_rebinned)):
        return np.zeros_like(pca.ar_wavelength_bins), pca.ar_wavelength_bins, 1, np.inf, 0

    for _ in range(3):
        # predict the full spectrum from the red part of the spectrum.
        ar_full_fit = self.fit_function(pca, ar_red_flux_rebinned_normalized,
                                        ar_red_ivar_rebinned)
        # restore the original flux scale
        ar_full_fit = ar_full_fit * normalization_factor
        ar_red_fit = ar_full_fit[pca.LY_A_PEAK_INDEX:]

        # mask 2.5 sigma absorption.
        # suppress the error when dividing by 0, because 0 ivar is already masked,
        # so the code has no effect anyway.
        with np.errstate(divide='ignore', invalid='ignore'):
            ar_absorption_mask = ar_red_flux_rebinned - ar_red_fit < -2.5 * (
                ar_red_ivar_rebinned ** -0.5)
        ar_red_ivar_rebinned[ar_absorption_mask] = 0

    ar_blue_fit = ar_full_fit[:pca.LY_A_PEAK_INDEX]
    ar_blue_flux_rebinned = ar_flux_rebinned[:pca.LY_A_PEAK_INDEX]
    ar_blue_ivar_rebinned = ar_ivar_rebinned[:pca.LY_A_PEAK_INDEX]
    ar_blue_fit_mean_flux_rebinned = ar_mean_flux_constraint[:pca.LY_A_PEAK_INDEX] * ar_blue_fit

    # ignore pixels with 0 ivar
    ar_blue_data_mask = np.logical_and(np.isfinite(ar_blue_flux_rebinned),
                                       ar_blue_ivar_rebinned)
    if np.array(ar_blue_data_mask).sum() > 50:
        # find the optimal mean flux regulation:
        params = lmfit.Parameters()
        params.add('a_mf', value=0, min=-300, max=300)
        if qso_redshift > 2.4:
            # there are enough forest pixels for a 2nd order fit:
            params.add('b_mf', value=0, min=-300, max=300)
            result = lmfit.minimize(
                fcn=self.regulate_mean_flux_2nd_order_residual,
                params=params,
                args=(pca, ar_blue_flux_rebinned,
                      ar_blue_fit_mean_flux_rebinned, ar_blue_data_mask))
            # apply the 2nd order mean flux regulation to the continuum fit:
            ar_regulated_blue_flux = self.mean_flux_2nd_order_correction(
                result.params, ar_blue_fit, pca.delta_wavelength,
                pca.delta_wavelength_sq)
        else:
            # low redshift makes most of the forest inaccessible;
            # use a 1st order fit to avoid over-fitting.
            result = lmfit.minimize(
                fcn=self.regulate_mean_flux_1st_order_residual,
                params=params,
                args=(pca, ar_blue_flux_rebinned,
                      ar_blue_fit_mean_flux_rebinned, ar_blue_data_mask))
            # apply the 1st order mean flux regulation to the continuum fit:
            ar_regulated_blue_flux = self.mean_flux_1st_order_correction(
                result.params, ar_blue_fit, pca.delta_wavelength)

        # overwrite the original blue fit with the regulated fit.
        ar_full_fit[:pca.LY_A_PEAK_INDEX] = ar_regulated_blue_flux
    else:
        is_good_fit = False

    goodness_of_fit = self.get_goodness_of_fit(
        pca, ar_flux_rebinned, ar_full_fit) if is_good_fit else np.inf
    snr = self.get_simple_snr(
        ar_flux_rebinned[pca.LY_A_PEAK_INDEX:pca.RED_END_GOODNESS_OF_FIT_INDEX],
        ar_ivar_rebinned[pca.LY_A_PEAK_INDEX:pca.RED_END_GOODNESS_OF_FIT_INDEX])

    return ar_full_fit, pca.ar_wavelength_bins, normalization_factor, goodness_of_fit, snr
def enum_spectra(qso_record_table, plate_dir_list=PLATE_DIR_DEFAULT,
                 pre_sort=True, flag_stats=None,
                 and_mask=AND_MASK, or_mask=OR_MASK):
    """
    yields a QSOData object from the fits files corresponding to the appropriate qso_record
    :type qso_record_table: table.Table
    :type plate_dir_list: list[string]
    :type pre_sort: bool
    :type flag_stats: Optional[FlagStats]
    :param and_mask: set ivar=0 according to these and-mask flags
    :param or_mask: set ivar=0 according to these or-mask flags
    :rtype: Iterable[QSOData]
    """
    last_fits_partial_path = None
    # sort by plate to avoid reopening files too many times
    if pre_sort:
        qso_record_table.sort(['plate', 'mjd', 'fiberID'])
    for i in qso_record_table:
        qso_rec = QSORecord.from_row(i)
        fits_partial_path = get_fits_partial_path(qso_rec)

        # skip reading headers and getting data objects if the filename hasn't changed
        if fits_partial_path != last_fits_partial_path:
            fits_full_path = find_fits_file(plate_dir_list, fits_partial_path)
            if not fits_full_path:
                raise Exception("Missing file:", fits_partial_path)

            # get headers
            hdu_list = fits.open(fits_full_path, memmap=True)
            hdu0_header = hdu_list[0].header
            hdu1_header = hdu_list[1].header

            l1 = hdu1_header["NAXIS1"]

            c0 = hdu0_header["COEFF0"]
            c1 = hdu0_header["COEFF1"]
            l = hdu0_header["NAXIS1"]

            assert l1 == l, "flux and ivar dimensions must be equal"

            # wavelength grid
            counter = np.arange(0, l)
            o_grid = 10 ** (c0 + c1 * counter)

            # get data objects
            flux_data = hdu_list[0].data
            ivar_data = hdu_list[1].data
            and_mask_data = hdu_list[2].data
            or_mask_data = hdu_list[3].data

            last_fits_partial_path = fits_partial_path

        if any(var is None for var in
               (flux_data, ivar_data, and_mask_data, or_mask_data, o_grid)):
            raise Exception("Unexpected uninitialized variables.")

        # return the requested spectrum
        ar_flux = flux_data[qso_rec.fiberID - 1]
        ar_ivar = ivar_data[qso_rec.fiberID - 1]
        assert ar_flux.size == ar_ivar.size

        current_and_mask_data = np.asarray(and_mask_data[qso_rec.fiberID - 1])
        current_or_mask_data = np.asarray(or_mask_data[qso_rec.fiberID - 1])
        ar_effective_mask = np.logical_or(current_and_mask_data & and_mask,
                                          current_or_mask_data & or_mask)

        if flag_stats is not None:
            for bit in range(0, 32):
                flag_stats.flag_count[bit, 0] += (current_and_mask_data & 1).sum()
                flag_stats.flag_count[bit, 1] += (current_or_mask_data & 1).sum()
                current_and_mask_data >>= 1
                current_or_mask_data >>= 1
            flag_stats.pixel_count += current_and_mask_data.size

        # temporary: set ivar to 0 for all bad pixels
        ar_ivar[ar_effective_mask != 0] = 0

        yield QSOData(qso_rec, o_grid, ar_flux, ar_ivar)
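# Usage sketch (hypothetical wiring, mirroring profile_main above); assumes
# QSOData exposes the qso_rec and ar_flux members it is constructed with:
#
#   qso_record_table = table.Table(np.load(settings.get_qso_metadata_npy()))
#   for qso_data in enum_spectra(qso_record_table):
#       print(qso_data.qso_rec, qso_data.ar_flux.size)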
def profile_main():
    galaxy_metadata_file_npy = settings.get_galaxy_metadata_npy()
    histogram_output_npz = settings.get_ism_real_median_npz()
    galaxy_record_table = table.Table(np.load(galaxy_metadata_file_npy))
    num_extinction_bins = settings.get_num_extinction_bins()
    extinction_field_name = settings.get_extinction_source()

    # group results into extinction bins with roughly equal numbers of spectra.
    galaxy_record_table.sort([extinction_field_name])
    # remove objects with unknown extinction
    galaxy_record_table = galaxy_record_table[np.where(
        np.isfinite(galaxy_record_table[extinction_field_name]))]

    # if comm.size > num_extinction_bins:
    #     raise Exception('too many MPI nodes')

    # split the work into 'jobs' for each mpi node.
    # a job is defined as a single extinction bin.
    # the index of every extinction bin is its job number.
    job_sizes, job_offsets = get_chunks(num_extinction_bins, comm.size)
    job_start = job_offsets[comm.rank]
    job_end = job_start + job_sizes[comm.rank]

    chunk_sizes, chunk_offsets = get_chunks(len(galaxy_record_table),
                                            num_extinction_bins)
    for i in range(job_start, job_end):
        extinction_bin_start = chunk_offsets[i]
        extinction_bin_end = extinction_bin_start + chunk_sizes[i]

        extinction_bin_record_table = galaxy_record_table[
            extinction_bin_start:extinction_bin_end]

        # collect group statistics; this should be done before the plate sort.
        group_parameters = {
            'extinction_bin_number': i,
            'extinction_minimum':
                extinction_bin_record_table[extinction_field_name][0],
            'extinction_maximum':
                extinction_bin_record_table[extinction_field_name][-1],
            'extinction_mean':
                np.mean(extinction_bin_record_table[extinction_field_name]),
            'extinction_median':
                np.median(extinction_bin_record_table[extinction_field_name]),
        }

        # sort by plate to avoid constant switching of fits files (which are per plate).
        extinction_bin_record_table.sort(['plate', 'mjd', 'fiberID'])

        base_filename, file_extension = splitext(histogram_output_npz)
        output_filename = '{}_{:02d}{}'.format(base_filename, i, file_extension)

        l_print_no_barrier('Starting extinction bin {}'.format(i))
        calc_median_spectrum(extinction_bin_record_table, output_filename,
                             group_parameters=group_parameters)
        l_print_no_barrier('Finished extinction bin {}'.format(i))

    for _ in barrier_sleep(comm, use_yield=True):
        l_print_no_barrier("waiting")
def get_update_mask(num_updates, num_items):
    # mark num_updates evenly spaced items (the last item is always marked)
    mask = np.zeros(num_items, dtype=bool)
    for i in range(num_updates):
        mask[int((i + 1) * num_items / num_updates) - 1] = True
    return mask
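# Usage sketch:
#   >>> np.flatnonzero(get_update_mask(4, 10))
#   array([1, 4, 6, 9])
# i.e. four progress updates, roughly evenly spaced, always including the last item.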
def profile_main():
    qso_record_table = table.Table(np.load(settings.get_qso_metadata_npy()))
    qso_record_list = [QSORecord.from_row(i) for i in qso_record_table]

    qso_spectra_hdf5 = settings.get_qso_spectra_hdf5()
    output_spectra = Hdf5SpectrumContainer(qso_spectra_hdf5, readonly=False,
                                           create_new=False,
                                           num_spectra=MAX_SPECTRA)

    total_ar_x = np.array([])
    total_ar_y = np.array([])
    total_ar_z = np.array([])
    total_ar_c = np.array([])

    for n in range(len(qso_record_list)):
        qso_rec = qso_record_list[n]
        redshift = qso_rec.z

        # load data
        ar_wavelength = output_spectra.get_wavelength(n)
        ar_flux = output_spectra.get_flux(n)
        ar_ivar = output_spectra.get_ivar(n)

        # convert wavelength to redshift
        ar_redshift = ar_wavelength / lya_center - 1

        # fit continuum
        ar_rest_wavelength = ar_wavelength / (1 + redshift)

        fit_result = fit_pca.fit(ar_rest_wavelength, ar_flux, ar_ivar,
                                 qso_redshift=redshift,
                                 boundary_value=np.nan,
                                 mean_flux_constraint_func=None)

        # transmission is only meaningful in the ly_alpha range,
        # and also requires a valid fit for that wavelength.
        # use the same range as in 1404.1801 (2014)
        forest_mask = np.logical_and(ar_wavelength > 1040 * (1 + redshift),
                                     ar_wavelength < 1200 * (1 + redshift))
        fit_mask = ~np.isnan(fit_result.spectrum)
        effective_mask = forest_mask & fit_mask
        # ar_wavelength_masked = ar_wavelength[effective_mask]
        # ar_fit_spectrum_masked = fit_result.spectrum[effective_mask]

        # convert redshift to comoving distance
        ar_dist = np.asarray(
            cd.fast_comoving_distance(ar_redshift[effective_mask]))

        dec = qso_rec.dec * np.pi / 180
        ra = qso_rec.ra * np.pi / 180
        x_unit = np.cos(dec) * np.cos(ra)
        y_unit = np.cos(dec) * np.sin(ra)
        z_unit = np.sin(dec)

        scale = 1
        ar_x = x_unit * ar_dist * scale
        ar_y = y_unit * ar_dist * scale
        # Note: this is the geometric coordinate, not redshift
        ar_z = z_unit * ar_dist * scale

        ar_mock_forest_array = mock_forest.get_forest(ar_x, ar_y, ar_z)
        ar_delta_t = -ar_mock_forest_array
        ar_rel_transmittance = ar_delta_t + 1

        # set the forest part of the spectrum to the mock forest
        mock_fraction = 1
        ar_flux[effective_mask] = \
            ar_flux[effective_mask] * (1 - mock_fraction) + \
            ar_rel_transmittance * fit_result.spectrum[effective_mask] * mock_fraction

        if draw_graph:
            display_mask = ar_mock_forest_array > 0.
            total_ar_x = np.append(total_ar_x, ar_x[display_mask])
            total_ar_y = np.append(total_ar_y, ar_y[display_mask])
            total_ar_z = np.append(total_ar_z, ar_z[display_mask])
            total_ar_c = np.append(total_ar_c,
                                   ar_mock_forest_array[display_mask])

        # overwrite the existing forest
        output_spectra.set_flux(n, ar_flux)
        if n % 1000 == 0:
            print(n)

    if draw_graph:
        mlab.points3d(total_ar_x, total_ar_y, total_ar_z, total_ar_c,
                      mode='sphere', scale_mode='vector', scale_factor=20,
                      transparent=True, vmin=0, vmax=1, opacity=0.03)
        mlab.show()
def ism_transmittance_chunk(qso_record_table):
    start_offset = qso_record_table[0]['index']
    # spectra = read_spectrum_hdf5.SpectraWithMetadata(qso_record_table, settings.get_qso_spectra_hdf5())
    # continuum_fit_file = NpSpectrumContainer(True, filename=settings.get_continuum_fit_npy())
    delta_transmittance_file = NpSpectrumContainer(
        readonly=True, filename=settings.get_delta_t_npy(),
        max_wavelength_count=1000)

    num_spectra = len(qso_record_table)
    ism_delta_t = NpSpectrumContainer(False, num_spectra=num_spectra)
    # warning: np.ndarray is not initialized by default. zeroing manually.
    ism_delta_t.zero()
    n = 0
    for i in range(len(qso_record_table)):
        qso_rec = QSORecord.from_row(qso_record_table[i])
        index = qso_rec.index

        # read the original delta transmittance
        ar_redshift = delta_transmittance_file.get_wavelength(index)
        # ar_flux = delta_transmittance_file.get_flux(index)
        ar_ivar = delta_transmittance_file.get_ivar(index)

        # get correction to ISM
        # ar_flux_new, ar_ivar_new, is_corrected = pre_process_spectrum.mw_lines.apply_correction(
        #     ar_wavelength, np.ones_like(ar_flux), ar_ivar, qso_rec.ra, qso_rec.dec)

        ar_wavelength = (ar_redshift + 1) * lya_center  # type: np.ndarray
        # limit the bin number because the higher extinction bins are not reliable
        max_extinction_bin = min(20, ar_extinction_levels.size)

        if np.isfinite(qso_rec.extinction_g):
            # note: slice the levels so that xp and fp passed to np.interp
            # have equal lengths.
            extinction_bin = int(np.round(np.interp(
                qso_rec.extinction_g,
                ar_extinction_levels[:max_extinction_bin],
                np.arange(max_extinction_bin))))
        else:
            extinction_bin = 0

        l_print_no_barrier("extinction_bin = ", extinction_bin)
        ar_ism_resampled = np.interp(
            ar_wavelength,
            extinction_spectra_list[extinction_bin][0],
            extinction_spectra_list[extinction_bin][1],
            left=np.nan, right=np.nan)
        extinction = ar_extinction_levels[extinction_bin]
        # rescale according to the QSO extinction
        l_print_no_barrier(qso_rec.extinction_g, extinction)
        ism_scale_factor = 1.
        ar_flux_new = (ar_ism_resampled - 1) * ism_scale_factor * \
            qso_rec.extinction_g / extinction

        mask = np.logical_and(np.isfinite(ar_flux_new), ar_ivar > 0)

        ism_delta_t.set_wavelength(i, ar_redshift[mask])
        ism_delta_t.set_flux(i, ar_flux_new[mask])
        # ism_delta_t.set_flux(i, np.ones_like(ar_flux) * qso_rec.extinction_g)
        # use the original ivar because we are not correcting an existing spectrum
        ism_delta_t.set_ivar(i, ar_ivar[mask])

        n += 1

    l_print_no_barrier("chunk n =", n, "offset =", start_offset)
    return ism_delta_t.as_np_array(), None