def test_values():
    """Sanity-check the constants and unit conversions stored in Parameters."""
    parameters = Parameters()

    # rest-frame Lyman-alpha and Lyman-beta wavelengths (Angstroms)
    assert abs(parameters.lya_wavelength - 1216) < 1
    assert abs(parameters.lyb_wavelength - 1025) < 1

    # velocity-to-redshift conversion
    assert abs(parameters.kms_to_z(3000) - 0.01) < 1e-4

    # observed_wavelengths and emitted_wavelengths should be inverses of each other
    wavelength = 1000
    z_qso = 3
    assert (abs(wavelength - parameters.emitted_wavelengths(
        parameters.observed_wavelengths(wavelength, z_qso), z_qso)) < 1e-4)
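# For reference, the kms_to_z assertion above is just the non-relativistic Doppler
# relation z = v / c: 3000 km/s / 299792.458 km/s ≈ 0.010007, which lies within the
# 1e-4 tolerance of 0.01. (This is a reading of the test, not a statement about the
# internals of Parameters.kms_to_z.)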
def test_prior():
    """Check the DLA model log priors against the values from the MATLAB catalog."""
    # test 1
    filename = "spec-5309-55929-0362.fits"
    if not os.path.exists(filename):
        retrieve_raw_spec(5309, 55929, 362)  # the spectrum used in the paper

    z_qso = 3.166

    param = Parameters()

    # prepare these files by running the MATLAB scripts up to build_catalog.m
    prior = PriorCatalog(
        param,
        "data/dr12q/processed/catalog.mat",
        "data/dla_catalogs/dr9q_concordance/processed/los_catalog",
        "data/dla_catalogs/dr9q_concordance/processed/dla_catalog",
    )
    dla_samples = DLASamplesMAT(param, prior,
                                "data/dr12q/processed/dla_samples_a03.mat")

    wavelengths, flux, noise_variance, pixel_mask = read_spec(filename)
    rest_wavelengths = param.emitted_wavelengths(wavelengths, z_qso)

    # DLA GP model
    dla_gp = DLAGPMAT(
        param,
        prior,
        dla_samples,
        3000.0,
        "data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
        True,
    )
    dla_gp.set_data(rest_wavelengths, flux, noise_variance, pixel_mask, z_qso,
                    build_model=True)

    log_priors = dla_gp.log_priors(z_qso, max_dlas=4)

    # log priors from the MATLAB catalog
    catalog_log_priors = np.array(
        [-2.53774598, -4.97413739, -7.40285925, -9.74851888])

    assert np.all(np.abs(log_priors - catalog_log_priors) < 1e-4)
def prepare_subdla_model(plate: int = 5309, mjd: int = 55929,
                         fiber_id: int = 362, z_qso: float = 3.166) -> SubDLAGPMAT:
    """
    Return a SubDLAGP instance built from an input SDSS DR12 spectrum.
    """
    filename = "spec-{}-{}-{}.fits".format(plate, mjd, str(fiber_id).zfill(4))
    if not os.path.exists(filename):
        retrieve_raw_spec(plate, mjd, fiber_id)  # the spectrum used in the paper

    param = Parameters()

    # prepare these files by running the MATLAB scripts up to build_catalog.m
    prior = PriorCatalog(
        param,
        "data/dr12q/processed/catalog.mat",
        "data/dla_catalogs/dr9q_concordance/processed/los_catalog",
        "data/dla_catalogs/dr9q_concordance/processed/dla_catalog",
    )
    subdla_samples = SubDLASamplesMAT(
        param, prior, "data/dr12q/processed/subdla_samples.mat")

    wavelengths, flux, noise_variance, pixel_mask = read_spec(filename)
    rest_wavelengths = param.emitted_wavelengths(wavelengths, z_qso)

    # subDLA GP model
    subdla_gp = SubDLAGPMAT(
        param,
        prior,
        subdla_samples,
        3000.0,
        "data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
        True,
    )
    subdla_gp.set_data(rest_wavelengths, flux, noise_variance, pixel_mask, z_qso,
                       build_model=True)

    return subdla_gp
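# Example usage of prepare_subdla_model (illustrative only; it assumes the processed
# .mat files referenced above are present, and the log_priors call below assumes
# SubDLAGPMAT exposes the same interface as DLAGPMAT used in test_prior):
#
#     subdla_gp = prepare_subdla_model(plate=5309, mjd=55929, fiber_id=362, z_qso=3.166)
#     log_priors_lls = subdla_gp.log_priors(3.166, max_dlas=1)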
def test_prior_catalog():
    """Check the prior catalog lookups and the p(DLA | zQSO) prior."""
    params = Parameters()
    prior = PriorCatalog(
        params,
        "data/dr12q/processed/catalog.mat",
        "data/dla_catalogs/dr9q_concordance/processed/los_catalog",
        "data/dla_catalogs/dr9q_concordance/processed/dla_catalog",
    )

    # catalog entry: thing_id 94892842, z_dla 2.0969, log NHI 20.0292
    ind = prior.thing_ids == 94892842
    assert np.all(prior.z_dlas[ind] == 2.0969)
    assert np.all(prior.log_nhis[ind] == 20.0292)

    # the p(DLA | zQSO) prior should saturate at around 0.1 for high zQSO
    zQSO = 5
    M, N = prior.less_ind(zQSO)
    p_dla_z = M / N
    assert 0.09 < p_dla_z < 0.11
def test_effective_optical_depth():
    """Check the behaviour of the Kim et al. (2007) effective optical depth."""
    z_qso = 4
    rest_wavelengths = np.linspace(911, 1216, 500)
    wavelengths = Parameters.observed_wavelengths(rest_wavelengths, z_qso)

    # tau_0 = 0.0023 and beta = 3.65 (Kim et al. 2007), including 31 Lyman-series lines
    total_optical_depth = effective_optical_depth(wavelengths, 3.65, 0.0023, z_qso,
                                                  31, True)

    # the mean transmitted flux exp(-tau_eff) must lie strictly between 0 and 1
    assert 0 < np.exp(-total_optical_depth.sum(axis=1)).min() < 1
    assert 0 < np.exp(-total_optical_depth.sum(axis=1)).max() < 1

    # adding more Lyman-series lines can only increase the total optical depth,
    # so the mean flux with 31 lines is lower than with a single line
    total_optical_depth_2 = effective_optical_depth(wavelengths, 3.65, 0.0023, z_qso,
                                                    1, True)
    assert np.mean(np.exp(-total_optical_depth.sum(axis=1))) < np.mean(
        np.exp(-total_optical_depth_2.sum(axis=1)))

    # with a lower zQSO, less of the spectrum lies in the forest, so the mean flux is higher
    total_optical_depth_3 = effective_optical_depth(wavelengths, 3.65, 0.0023, 2.2,
                                                    31, True)
    assert np.mean(np.exp(-total_optical_depth.sum(axis=1))) < np.mean(
        np.exp(-total_optical_depth_3.sum(axis=1)))
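# A minimal sketch (for illustration only, not the project's `effective_optical_depth`
# implementation) of the Lyman-alpha-only mean-flux suppression that the test above
# exercises: tau_eff(z) = tau_0 * (1 + z)^beta, with the Kim et al. (2007) values
# tau_0 = 0.0023 and beta = 3.65 used in the calls above. All names are local to
# this sketch.
def _lya_only_mean_flux_sketch(wavelengths: np.ndarray, z_qso: float) -> np.ndarray:
    """Approximate mean transmitted flux exp(-tau_eff) from Ly-alpha absorption only."""
    tau_0, beta = 0.0023, 3.65       # Kim et al. (2007) power-law mean-flux parameters
    lya_wavelength = 1215.6701       # rest-frame Ly-alpha wavelength in Angstroms
    # absorber redshift implied by each observed pixel: 1 + z = lambda_obs / lambda_lya
    one_plus_z = wavelengths / lya_wavelength
    # only pixels blueward of the quasar's Ly-alpha emission lie in the forest
    in_forest = one_plus_z <= (1 + z_qso)
    tau_eff = np.where(in_forest, tau_0 * one_plus_z ** beta, 0.0)
    return np.exp(-tau_eff)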
def test_log_likelihood_no_dla():
    """Check the null (no-DLA) GP model evidence against the MATLAB values."""
    # test 1
    filename = "spec-5309-55929-0362.fits"
    if not os.path.exists(filename):
        retrieve_raw_spec(5309, 55929, 362)  # the spectrum used in the paper

    z_qso = 3.166

    param = Parameters()

    # prepare these files by running the MATLAB scripts up to build_catalog.m
    prior = PriorCatalog(
        param,
        "data/dr12q/processed/catalog.mat",
        "data/dla_catalogs/dr9q_concordance/processed/los_catalog",
        "data/dla_catalogs/dr9q_concordance/processed/dla_catalog",
    )

    wavelengths, flux, noise_variance, pixel_mask = read_spec(filename)
    rest_wavelengths = param.emitted_wavelengths(wavelengths, z_qso)

    gp = NullGPMAT(
        param,
        prior,
        "data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
    )
    gp.set_data(rest_wavelengths, flux, noise_variance, pixel_mask, z_qso,
                build_model=True)

    log_likelihood_no_dla = gp.log_model_evidence()
    print(
        "log p( D | z_QSO, no DLA ) : {:.5g}".format(log_likelihood_no_dla))

    # there is some numerical difference with respect to the MATLAB value
    assert np.abs(log_likelihood_no_dla - (-889.04809017)) < 1

    plt.figure(figsize=(16, 5))
    plt.plot(gp.x, gp.y, label="observed flux")
    plt.plot(gp.rest_wavelengths, gp.mu, label="null GP")
    plt.plot(gp.x, gp.this_mu, label="interpolated null GP")
    plt.xlabel("rest wavelengths")
    plt.ylabel("normalised flux")
    plt.legend()
    plt.savefig("test1.pdf", format="pdf", dpi=300)
    plt.clf()
    plt.close()

    # test 2
    filename = "spec-3816-55272-0076.fits"
    z_qso = 3.68457627

    if not os.path.exists(filename):
        retrieve_raw_spec(3816, 55272, 76)  # the spectrum used in the paper

    wavelengths, flux, noise_variance, pixel_mask = read_spec(filename)
    rest_wavelengths = param.emitted_wavelengths(wavelengths, z_qso)

    gp.set_data(rest_wavelengths, flux, noise_variance, pixel_mask, z_qso,
                build_model=True)

    log_likelihood_no_dla = gp.log_model_evidence()
    print(
        "log p( D | z_QSO, no DLA ) : {:.5g}".format(log_likelihood_no_dla))

    assert np.abs(log_likelihood_no_dla - (-734.3727266)) < 1

    plt.figure(figsize=(16, 5))
    plt.plot(gp.x, gp.y, label="observed flux")
    plt.plot(gp.rest_wavelengths, gp.mu, label="null GP")
    plt.plot(gp.x, gp.this_mu, label="interpolated null GP")
    plt.xlabel("rest wavelengths")
    plt.ylabel("normalised flux")
    plt.legend()
    plt.savefig("test2.pdf", format="pdf", dpi=300)
    plt.clf()
    plt.close()
def test_dla_model_evidences(broadening: bool = True):
    """Check the DLA(1)-DLA(4) model evidences against the MATLAB catalog values."""
    # test 1
    filename = "spec-5309-55929-0362.fits"
    if not os.path.exists(filename):
        retrieve_raw_spec(5309, 55929, 362)  # the spectrum used in the paper

    z_qso = 3.166

    param = Parameters()

    # prepare these files by running the MATLAB scripts up to build_catalog.m
    prior = PriorCatalog(
        param,
        "data/dr12q/processed/catalog.mat",
        "data/dla_catalogs/dr9q_concordance/processed/los_catalog",
        "data/dla_catalogs/dr9q_concordance/processed/dla_catalog",
    )
    dla_samples = DLASamplesMAT(param, prior,
                                "data/dr12q/processed/dla_samples_a03.mat")

    wavelengths, flux, noise_variance, pixel_mask = read_spec(filename)
    rest_wavelengths = param.emitted_wavelengths(wavelengths, z_qso)

    # DLA GP model
    dla_gp = DLAGPMAT(
        param,
        prior,
        dla_samples,
        3000.0,
        "data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
        broadening,
    )
    dla_gp.set_data(rest_wavelengths, flux, noise_variance, pixel_mask, z_qso,
                    build_model=True)

    tic = time.time()

    max_dlas = 4
    log_likelihoods_dla = dla_gp.log_model_evidences(max_dlas)

    toc = time.time()
    # very time consuming: ~4 minutes for a single spectrum without parallelization
    print("spent {} mins; {} seconds".format((toc - tic) // 60, (toc - tic) % 60))

    # log likelihood results from the catalog
    catalog_log_likelihoods_dla = np.array(
        [-688.91647288, -633.00070813, -634.08569242, -640.77120558])

    for i in range(max_dlas):
        print("log p( D | z_QSO, DLA{} ) : {:.5g}; MATLAB value: {:.5g}".
              format(i + 1, log_likelihoods_dla[i],
                     catalog_log_likelihoods_dla[i]))

    # the agreement is only to within 2.5 in log scale; this needs to be investigated
    assert np.all(
        np.abs(catalog_log_likelihoods_dla - log_likelihoods_dla) < 2.5)
def test_dla_model(broadening: bool = True):
    """Check that the DLA model at the MAP parameters beats the null model evidence."""
    # test 1
    filename = "spec-5309-55929-0362.fits"
    if not os.path.exists(filename):
        retrieve_raw_spec(5309, 55929, 362)  # the spectrum used in the paper

    z_qso = 3.166

    param = Parameters()

    # prepare these files by running the MATLAB scripts up to build_catalog.m
    prior = PriorCatalog(
        param,
        "data/dr12q/processed/catalog.mat",
        "data/dla_catalogs/dr9q_concordance/processed/los_catalog",
        "data/dla_catalogs/dr9q_concordance/processed/dla_catalog",
    )
    dla_samples = DLASamplesMAT(param, prior,
                                "data/dr12q/processed/dla_samples_a03.mat")

    wavelengths, flux, noise_variance, pixel_mask = read_spec(filename)
    rest_wavelengths = param.emitted_wavelengths(wavelengths, z_qso)

    # DLA GP model
    dla_gp = DLAGPMAT(
        param,
        prior,
        dla_samples,
        3000.0,
        "data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
        broadening,
    )
    dla_gp.set_data(rest_wavelengths, flux, noise_variance, pixel_mask, z_qso,
                    build_model=True)

    # these are the MAP values from the paper
    z_dlas = np.array([2.52182382, 3.03175723])
    nhis = 10**np.array([20.63417494, 22.28420156])

    sample_log_likelihood_dla = dla_gp.sample_log_likelihood_k_dlas(z_dlas, nhis)
    print("log p( D | z_QSO, zdlas, nhis ) : {:.5g}".format(
        sample_log_likelihood_dla))

    # build a null model for comparison
    gp = NullGPMAT(
        param,
        prior,
        "data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
    )
    gp.set_data(rest_wavelengths, flux, noise_variance, pixel_mask, z_qso,
                build_model=True)

    log_likelihood_no_dla = gp.log_model_evidence()
    print(
        "log p( D | z_QSO, no DLA ) : {:.5g}".format(log_likelihood_no_dla))

    # the likelihood at the MAP DLA parameters should exceed the null model evidence
    assert sample_log_likelihood_dla > log_likelihood_no_dla
def process_qso(
    qso_list: List,
    z_qso_list: List,
    read_spec=read_spec.read_spec,
    max_dlas: int = 4,
    broadening: bool = True,
    plot_figures: bool = False,
):
    """
    Read the fits files in qso_list and process each QSO with Bayesian model selection.

    :param qso_list: a list of fits filenames.
    :param z_qso_list: a list of zQSO values corresponding to the spectra in qso_list.
    :param read_spec: a function to read the fits file.
    :param max_dlas: the maximum number of DLAs to model per spectrum.
    :param broadening: whether to apply the SDSS instrumental broadening.
    :param plot_figures: if True, plot the maximum a posteriori DLA estimates together
        with the likelihoods in the parameter space.

    An HDF5 file will be saved after running this function.

    (Comments from the MATLAB code)

    Run the DLA detection algorithm on the specified objects while using the lower
    lognhi range (defined in set_lls_parameters.m) as an alternative model.

    Note: model_posterior(quasar_ind, :)
        = [p(no dla | D), p(lls | D), p(1 dla | D), p(2 dla | D), ...]
    Also note that we should treat lls as no dla
    (see the normalisation sketch after this function).

    For higher-order DLA models, we consider the Occam's razor effect due to the
    normalisation in the higher dimensions of the parameter space.

    We apply exp(-optical_depth) to the mean flux:
        µ(z) := µ * exp( - τ (1 + z)^β ),  1 + z = λ_obs / λ_lya
    The priors on τ and β are taken from Kim et al. (2007).

    Nov 18, 2019: add all Lyman series to the effective optical depth:
        effective_optical_depth := ∑_i τ f_i1 λ_i1 / ( f_21 λ_21 ) * ( 1 + z_i1 )^β
    where
        1 + z_i1 = λ_obs / λ_i1 = λ_lya / λ_i1 * (1 + z_a)

    Dec 25, 2019: add Lyman series to the noise variance training:
        s(z) = 1 - exp(-effective_optical_depth) + c_0

    March 8, 2019: add an additional Occam's razor factor between the DLA models and
    the null model:
        P(DLAs | D) := P(DLAs | D) / num_dla_samples
    """
    param = Parameters()

    # prepare these files by running the MATLAB scripts up to build_catalog.m
    prior = PriorCatalog(
        param,
        "data/dr12q/processed/catalog.mat",
        "data/dla_catalogs/dr9q_concordance/processed/los_catalog",
        "data/dla_catalogs/dr9q_concordance/processed/dla_catalog",
    )
    dla_samples = DLASamplesMAT(
        param, prior, "data/dr12q/processed/dla_samples_a03.mat"
    )
    subdla_samples = SubDLASamplesMAT(
        param, prior, "data/dr12q/processed/subdla_samples.mat"
    )

    # initialize the Bayesian model selection class, with a maximum of `max_dlas` DLAs
    # and 1 subDLA, the same as Ho-Bird-Garnett (2020).
    bayes = BayesModelSelect(
        [0, 1, max_dlas], 2
    )  # 0 DLAs for the null model; 1 subDLA; max_dlas DLAs.
    # 2 is the index of the DLA model in the list.

    num_quasars = len(qso_list)

    # allocate the arrays we want to save. Try to save similar variables as in
    # multi_dlas/process_qsos_multiple_dlas_meanflux.m

    # initialize results
    min_z_dlas = np.full((num_quasars,), np.nan)
    max_z_dlas = np.full((num_quasars,), np.nan)

    log_priors_no_dla = np.full((num_quasars,), np.nan)
    log_priors_dla = np.full((num_quasars, max_dlas), np.nan)

    log_likelihoods_no_dla = np.full((num_quasars,), np.nan)
    log_likelihoods_dla = np.full((num_quasars, max_dlas), np.nan)

    log_posteriors_no_dla = np.full((num_quasars,), np.nan)
    log_posteriors_dla = np.full((num_quasars, max_dlas), np.nan)

    sample_log_likelihoods_dla = np.full(
        (num_quasars, param.num_dla_samples, max_dlas), np.nan
    )
    base_sample_inds = np.zeros(
        (num_quasars, param.num_dla_samples, max_dlas - 1), dtype=np.int32
    )

    # initialize lls results
    log_likelihoods_lls = np.full((num_quasars,), np.nan)
    log_posteriors_lls = np.full((num_quasars,), np.nan)
    log_priors_lls = np.full((num_quasars,), np.nan)
    sample_log_likelihoods_lls = np.full((num_quasars, param.num_dla_samples), np.nan)

    # save MAPs: initializations of the MAP values,
    # shape: N * (1~k models) * (1~k MAP dlas)
    MAP_z_dlas = np.full((num_quasars, max_dlas, max_dlas), np.nan)
    MAP_log_nhis = np.full((num_quasars, max_dlas, max_dlas), np.nan)
    MAP_inds = np.full((num_quasars, max_dlas, max_dlas), np.nan)

    # save model_posteriors in real scale, not in log scale
    model_posteriors = np.full((num_quasars, 1 + 1 + max_dlas,), np.nan)
    p_dlas = np.full((num_quasars,), np.nan)
    p_no_dlas = np.full((num_quasars,), np.nan)

    for quasar_ind, (filename, z_qso) in enumerate(zip(qso_list, z_qso_list)):
        tic = time.time()

        np.random.seed(0)

        wavelengths, flux, noise_variance, pixel_mask = read_spec(filename)
        rest_wavelengths = param.emitted_wavelengths(wavelengths, z_qso)

        # Null model GP: a GP model without any intervening DLA; we don't need to
        # run QMC sampling on this model.
        gp = NullGPMAT(
            param,
            prior,
            "data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
        )
        gp.set_data(
            rest_wavelengths, flux, noise_variance, pixel_mask, z_qso, build_model=True
        )

        # DLA model GP: the null model GP plus Voigt profiles, parameterised with
        # {(z_dla, logNHI)}_{i=1}^{k}, where k is the maximum number of DLAs to model.
        # We estimate log posteriors for the DLA(1), ..., DLA(k) models.
        dla_gp = DLAGPMAT(
            params=param,
            prior=prior,
            dla_samples=dla_samples,
            min_z_separation=3000.0,
            learned_file="data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
            broadening=broadening,
        )
        dla_gp.set_data(
            rest_wavelengths, flux, noise_variance, pixel_mask, z_qso, build_model=True
        )

        subdla_gp = SubDLAGPMAT(
            params=param,
            prior=prior,
            dla_samples=subdla_samples,
            min_z_separation=3000.0,
            learned_file="data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
            broadening=broadening,
        )
        subdla_gp.set_data(
            rest_wavelengths, flux, noise_variance, pixel_mask, z_qso, build_model=True
        )

        # run Bayesian model selection and get the log posteriors
        log_posteriors = bayes.model_selection([gp, subdla_gp, dla_gp], z_qso)
        # at the same time, log_priors, log_likelihoods, and log_posteriors are saved
        # to the bayes instance.
        # store the values into the arrays
        min_z_dlas[quasar_ind] = dla_gp.params.min_z_dla(wavelengths, z_qso)
        max_z_dlas[quasar_ind] = dla_gp.params.max_z_dla(wavelengths, z_qso)

        # null model is index 0; subDLA model is index 1
        log_priors_no_dla[quasar_ind] = bayes.log_priors[0]
        log_priors_dla[quasar_ind, :] = bayes.log_priors[-max_dlas:]
        log_priors_lls[quasar_ind] = bayes.log_priors[1]

        log_likelihoods_no_dla[quasar_ind] = bayes.log_likelihoods[0]
        log_likelihoods_dla[quasar_ind, :] = bayes.log_likelihoods[-max_dlas:]
        log_likelihoods_lls[quasar_ind] = bayes.log_likelihoods[1]

        log_posteriors_no_dla[quasar_ind] = bayes.log_posteriors[0]
        log_posteriors_dla[quasar_ind, :] = bayes.log_posteriors[-max_dlas:]
        log_posteriors_lls[quasar_ind] = bayes.log_posteriors[1]

        sample_log_likelihoods_dla[quasar_ind, :, :] = dla_gp.sample_log_likelihoods[
            :, :
        ]
        base_sample_inds[quasar_ind, :, :] = dla_gp.base_sample_inds[:, :].T
        sample_log_likelihoods_lls[quasar_ind, :] = subdla_gp.sample_log_likelihoods[
            :, 0
        ]

        # save the MAP values,
        # shape: N * (1~k models) * (1~k MAP dlas)
        MAP_z_dla, MAP_log_nhi = dla_gp.maximum_a_posteriori()
        MAP_z_dlas[quasar_ind, :, :] = MAP_z_dla[:, :]
        MAP_log_nhis[quasar_ind, :, :] = MAP_log_nhi[:, :]

        # save model_posteriors in real scale, not in log scale
        model_posteriors[quasar_ind, :] = bayes.model_posteriors[:]
        p_dlas[quasar_ind] = bayes.p_dla
        p_no_dlas[quasar_ind] = bayes.p_no_dla

        toc = time.time()
        # very time consuming: ~4 minutes for a single spectrum without parallelization
        print("spent {} mins; {} seconds".format((toc - tic) // 60, (toc - tic) % 60))

        # [make plots] plot the inference results:
        # GP mean model + sample likelihood scatter plot
        if plot_figures:
            title = filename + "; zQSO: {:.2g};\n".format(z_qso)
            out_filename = "spec-{}".format(str(quasar_ind).zfill(6))
            make_plots(
                dla_gp=dla_gp, bayes=bayes, filename=out_filename,
                sub_dir="images", title=title
            )
            plt.clf()
            plt.close()

    # write into an HDF5 file
    with h5py.File("processed_qsos_multi_meanflux.h5", "w") as f:
        # store the default parameter settings for the model
        f.create_dataset(
            "prior_z_qso_increase", data=dla_gp.params.prior_z_qso_increase
        )
        f.create_dataset("k", data=dla_gp.params.k)
        f.create_dataset(
            "normalization_min_lambda", data=dla_gp.params.normalization_min_lambda
        )
        f.create_dataset(
            "normalization_max_lambda", data=dla_gp.params.normalization_max_lambda
        )
        f.create_dataset("min_z_cut", data=dla_gp.params.min_z_cut)
        f.create_dataset("max_z_cut", data=dla_gp.params.max_z_cut)
        f.create_dataset("num_dla_samples", data=dla_gp.params.num_dla_samples)
        f.create_dataset("num_lines", data=dla_gp.params.num_lines)
        f.create_dataset("num_forest_lines", data=dla_gp.params.num_forest_lines)

        # store the sampling variables
        f.create_dataset("min_z_dlas", data=min_z_dlas)
        f.create_dataset("max_z_dlas", data=max_z_dlas)

        f.create_dataset("sample_log_likelihoods_dla", data=sample_log_likelihoods_dla)
        f.create_dataset("base_sample_inds", data=base_sample_inds)

        f.create_dataset("log_priors_no_dla", data=log_priors_no_dla)
        f.create_dataset("log_priors_lls", data=log_priors_lls)
        f.create_dataset("log_priors_dla", data=log_priors_dla)

        f.create_dataset("log_likelihoods_no_dla", data=log_likelihoods_no_dla)
        f.create_dataset("log_likelihoods_lls", data=log_likelihoods_lls)
        f.create_dataset("log_likelihoods_dla", data=log_likelihoods_dla)

        f.create_dataset("log_posteriors_no_dla", data=log_posteriors_no_dla)
f.create_dataset("log_posteriors_lls", data=log_posteriors_lls) f.create_dataset("log_posteriors_dla", data=log_posteriors_dla) f.create_dataset("MAP_z_dlas", data=MAP_z_dlas) f.create_dataset("MAP_log_nhis", data=MAP_log_nhis) f.create_dataset("p_dlas", data=p_dlas) f.create_dataset("p_no_dlas", data=p_no_dlas) f.create_dataset("model_posteriors", data=model_posteriors) # also save zQSOs f.create_dataset("z_qsos", data=np.array(z_qso_list)) # also save the filename list for reproducibility f.create_dataset("qso_list", data=np.array(qso_list, h5py.string_dtype(encoding="utf-8")))