def test_read_lnp_data(self):
    """
    Read in the lnp data from a cached file and test that selected values
    are as expected.
    """
    ldata = read_lnp_data(self.lnp_fname_cache)

    exp_keys = ["vals", "indxs"]
    for ckey in ldata.keys():
        assert ckey in exp_keys, f"{ckey} not in lnp data expected keys"

    # check an entry for a single model (caching current values 20 Apr 2020)
    # fmt: off
    exp_vals = [-56.83604431, -76.34762573, -17.55770874, -18.23323059, -10.53744507]
    exp_indxs = [14639.0, 15015.0, 296.0, 12636.0, 1336.0]
    # fmt: on
    np.testing.assert_allclose(
        ldata["vals"][0][0:5],
        exp_vals,
        err_msg="Expected posterior (vals) values not correct",
    )
    np.testing.assert_allclose(
        ldata["indxs"][0][0:5],
        exp_indxs,
        err_msg="Expected index values not correct",
    )
def test_get_lnp_grid_vals(self):
    """
    Read in the lnp and sed grid data from cached files and test that
    selected values are as expected.
    """
    ldata = read_lnp_data(self.lnp_fname_cache)

    requested_params = ["Av", "Rv", "f_A", "M_ini", "logA", "Z", "distance"]
    sdata = read_sed_data(self.seds_trim_fname_cache, param_list=requested_params)

    lgvals_data = get_lnp_grid_vals(sdata, ldata)

    # check that the requested lgvals data is returned
    expected_values = {
        "Av": [0.0, 0.0, 0.0, 0.0, 0.0],
        "Rv": [2.0, 2.0, 2.0, 2.0, 2.0],
        "f_A": [1.0, 1.0, 1.0, 1.0, 1.0],
        "M_ini": [3.89416909, 3.92726111, 3.95603228, 2.04966068, 2.04999995],
        "logA": [6.0, 6.0, 6.0, 9.0, 9.0],
        "Z": [0.03, 0.03, 0.03, 0.004, 0.004],
        "distance": [
            783429.64276621,
            783429.64276621,
            783429.64276621,
            783429.64276621,
            783429.64276621,
        ],
    }
    for cname in requested_params:
        assert (
            cname in lgvals_data.keys()
        ), f"requested parameter {cname} not in sed data"
        np.testing.assert_allclose(
            lgvals_data[cname][0:5, 10],
            expected_values[cname],
            err_msg=f"expected value of {cname} is not found",
        )
def merge_lnp(
    subgrid_lnp_fnames,
    re_run=False,
    output_fname_base=None,
    threshold=None,
):
    """
    Merge a set of sparsely sampled log likelihood (lnp) files.  It is
    assumed that they are for each part of a subgrid, such that a given
    star_# in each file corresponds to the same star_# in the other file(s).

    Note that this should NOT be used to combine files across source density
    or background bins.

    Parameters
    ----------
    subgrid_lnp_fnames : list of strings
        file names of all the lnp fits files

    re_run : boolean (default=False)
        If True, re-run the merging, even if the merged file already exists.
        If False, only merge files if the merged file doesn't exist.

    output_fname_base : string (default=None)
        If set, the output file is named output_fname_base + "_lnp.fits";
        otherwise it is named "combined_lnp.fits"

    threshold : float (default=None)
        If set: for a given star, any lnP values below max(lnP)-threshold
        will be deleted

    Returns
    -------
    merged_lnp_fname : string
        file name of the resulting lnp fits file (newly created by this
        function)
    """

    # create the output filename
    if output_fname_base is None:
        merged_lnp_fname = "combined_lnp.fits"
    else:
        merged_lnp_fname = output_fname_base + "_lnp.fits"

    # check if we need to rerun
    if os.path.isfile(merged_lnp_fname) and (re_run is False):
        print(str(len(subgrid_lnp_fnames)) + " files already merged, skipping")
        return merged_lnp_fname

    # dictionaries to compile all the info
    merged_lnp = defaultdict(list)
    merged_subgrid = defaultdict(list)
    merged_idx = defaultdict(list)

    for fname in subgrid_lnp_fnames:

        # extract the subgrid number from the filename
        subgrid_num = [i for i in fname.split("_") if "gridsub" in i][0][7:]

        # read in the SED indices and lnP values
        lnp_data = read_beast_data.read_lnp_data(fname, shift_lnp=False)
        n_lnp, n_star = lnp_data["vals"].shape

        # save each star's values into the master dictionaries
        for i in range(n_star):
            merged_lnp["star_" + str(i)] += lnp_data["vals"][:, i].tolist()
            merged_idx["star_" + str(i)] += lnp_data["indxs"][:, i].tolist()
            merged_subgrid["star_" + str(i)] += np.full(
                n_lnp, int(subgrid_num)
            ).tolist()

    # go through each star and remove values that are too small
    if threshold is not None:

        # keep track of how long the list of good values is
        good_list_len = np.zeros(n_star, dtype=int)

        # go through each star
        for i in range(n_star):

            star_label = "star_" + str(i)

            # good indices
            keep_ind = np.where(
                np.array(merged_lnp[star_label])
                > (max(merged_lnp[star_label]) - threshold)
            )[0]
            good_list_len[i] = len(keep_ind)

            # save just those
            merged_lnp[star_label] = np.array(merged_lnp[star_label])[
                keep_ind
            ].tolist()
            merged_idx[star_label] = np.array(merged_idx[star_label])[
                keep_ind
            ].tolist()
            merged_subgrid[star_label] = np.array(merged_subgrid[star_label])[
                keep_ind
            ].tolist()

        # figure out how many padded -inf/nan values need to be appended to
        # make each list the same length
        n_list_pad = (np.max(good_list_len) - good_list_len).astype(int)

    else:
        # no list padding if there's no trimming for threshold
        n_list_pad = np.zeros(n_star, dtype=int)

    # write everything out to a new file
    # (written with h5py, so the contents are HDF5 regardless of the
    # .fits extension)
    with h5py.File(merged_lnp_fname, "w") as out_file:
        for i in range(n_star):
            star_label = "star_" + str(i)
            star_group = out_file.create_group(star_label)
            star_group.create_dataset(
                "idx",
                data=np.array(
                    merged_idx[star_label] + int(n_list_pad[i]) * [np.nan]
                ),
            )
            star_group.create_dataset(
                "lnp",
                data=np.array(
                    merged_lnp[star_label] + int(n_list_pad[i]) * [-np.inf]
                ),
            )
            star_group.create_dataset(
                "subgrid",
                data=np.array(
                    merged_subgrid[star_label] + int(n_list_pad[i]) * [np.nan]
                ),
            )

    return merged_lnp_fname
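# A minimal usage sketch for merge_lnp (the file names below are hypothetical
# placeholders; real lnp files must contain "gridsub<N>" in their names so the
# subgrid number can be extracted):
#
#     subgrid_fnames = ["proj_gridsub0_lnp.hd5", "proj_gridsub1_lnp.hd5"]
#     merged_fname = merge_lnp(
#         subgrid_fnames,
#         output_fname_base="proj",
#         threshold=10.0,  # keep lnP values within 10 of each star's maximum
#     )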
def megabeast(megabeast_input_file, verbose=True):
    """
    Run the MegaBEAST on each of the spatially-reordered BEAST outputs.

    Parameters
    ----------
    megabeast_input_file : string
        Name of the file that contains settings, filenames, etc

    verbose : boolean (default=True)
        print extra info
    """
    # read in the settings from the file
    mb_settings = read_megabeast_input(megabeast_input_file)

    # setup the megabeast model including defining the priors
    #   - dust distribution model
    #   - stellar populations model (later)

    # use the nstars image to setup for each pixel
    nstars_image, nstars_header = fits.getdata(
        mb_settings["nstars_filename"], header=True
    )
    n_x, n_y = nstars_image.shape

    # read in the beast data that is needed by all the pixels
    beast_data = {}
    # - SED data
    beast_data.update(
        read_beast_data.read_sed_data(
            mb_settings["beast_seds_filename"],
            param_list=["Av"],  # , "Rv", "f_A"
        )
    )
    # - max completeness
    beast_data.update(
        read_beast_data.read_noise_data(
            mb_settings["beast_noise_filename"],
            param_list=["completeness"],
        )
    )
    beast_data["completeness"] = np.max(beast_data["completeness"], axis=1)

    # setup for output
    pixel_fit_status = np.full((n_x, n_y), False, dtype=bool)
    n_fit_params = len(mb_settings["fit_param_names"])
    best_fit_images = np.zeros((n_x, n_y, n_fit_params), dtype=float) + np.nan

    # loop over the pixels with enough stars in the nstars image
    for i in trange(n_x, desc="x pixels"):
        for j in trange(n_y, desc="y pixels", leave=False):
            if verbose:
                print("working on (%i,%i)" % (i, j))
            if nstars_image[i, j] >= mb_settings["min_for_fit"]:
                pixel_fit_status[i, j] = True

                # get the saved sparse likelihoods
                lnp_filename = mb_settings[
                    "lnp_file_prefix"
                ] + "_{0}_{1}_lnp.hd5".format(j, i)
                lnp_data = read_beast_data.read_lnp_data(
                    lnp_filename,
                    nstars=nstars_image[i, j],
                    shift_lnp=True,
                )

                # get the completeness and BEAST model parameters for the
                # same grid points as the sparse likelihoods
                lnp_grid_vals = read_beast_data.get_lnp_grid_vals(
                    beast_data, lnp_data
                )

                # initialize the ensemble model with the parameters used
                # for the saved BEAST model run results
                #   currently only dust parameters allowed
                #   for testing -> only Av
                avs = lnp_grid_vals["Av"]
                rvs = [3.1]  # beast_data['Rv']
                fAs = [1.0]  # beast_data['f_A']
                beast_dust_priors = PriorWeightsDust(
                    avs,
                    mb_settings["av_prior_model"],
                    rvs,
                    mb_settings["rv_prior_model"],
                    fAs,
                    mb_settings["fA_prior_model"],
                )

                # standard minimization to find initial values
                # (scipy calls fun(x, *args), so chi2 must accept all of
                # them as positional arguments)
                def chi2(*args):
                    return -1.0 * lnprob(*args)

                result = op.minimize(
                    chi2,
                    [0.25, 2.0, 0.5, 0.5, 1],
                    args=(beast_dust_priors, lnp_data, lnp_grid_vals),
                    method="Nelder-Mead",
                )
                best_fit_images[i, j, :] = result["x"]

                # then run through MCMC to fully sample likelihood
                #   include option not to run MCMC

    # output results (currently just the best fit)
    #   - best fit
    #   - megabeast parameter 1D pPDFs
    #   - MCMC chain

    master_header = nstars_header

    # now, write the maps to disk

    # check that the directory exists
    if not os.path.exists("./" + mb_settings["projectname"] + "_megabeast/"):
        os.makedirs("./" + mb_settings["projectname"] + "_megabeast/")

    for k, cname in enumerate(mb_settings["fit_param_names"]):
        hdu = fits.PrimaryHDU(best_fit_images[:, :, k], header=master_header)
        # save to FITS file
        hdu.writeto(
            "%s_megabeast/%s_%s_bestfit.fits"
            % (mb_settings["projectname"], mb_settings["projectname"], cname),
            overwrite=True,
        )
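# A minimal usage sketch for the megabeast driver (the settings file name is a
# placeholder; read_megabeast_input defines the required keys, e.g.
# nstars_filename, beast_seds_filename, beast_noise_filename, lnp_file_prefix,
# min_for_fit, fit_param_names, and the dust prior models):
#
#     megabeast("megabeast_input.txt", verbose=False)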
def fit_ensemble(beast_data, lnp_filename, beast_priormodel, nstars_expected=None):
    """
    Run the MegaBEAST on a single set of BEAST results.

    Parameters
    ----------
    beast_data : dict
        information about the BEAST runs including SED grid and noise model

    lnp_filename : string
        file with posteriors from BEAST fitting

    beast_priormodel : dict
        dictionary of the BEAST prior model information

    nstars_expected : int
        number of stars expected; used as a check

    Returns
    -------
    fit_results : array
        set of best fit parameters
    """
    # get the saved sparse likelihoods
    lnp_data = read_lnp_data(lnp_filename, nstars=nstars_expected, shift_lnp=True)

    # get the completeness and BEAST model parameters for the
    # same grid points as the sparse likelihoods
    lnp_grid_vals = get_lnp_grid_vals(beast_data, lnp_data)

    # compute the BEAST prior weights
    #   needed so the BEAST posteriors can be updated with the MegaBEAST model
    # ***currently only the Av ensemble model is supported***
    avs = lnp_grid_vals["Av"]
    rvs = [3.1]  # beast_data['Rv']
    fAs = [1.0]  # beast_data['f_A']
    beast_dust_priors = PriorWeightsDust(
        avs,
        beast_priormodel["AV"],
        rvs,
        beast_priormodel["RV"],
        fAs,
        beast_priormodel["fA"],
    )

    # standard minimization to find initial values
    # (scipy calls fun(x, *args), so chi2 must accept all of them as
    # positional arguments)
    def chi2(*args):
        return -1.0 * lnprob(*args)

    result = op.minimize(
        chi2,
        [0.25, 2.0, 0.5, 0.5, 1],
        args=(beast_dust_priors, lnp_data, lnp_grid_vals),
        method="Nelder-Mead",
    )

    # the next step would be to run through MCMC to fully sample the
    # likelihood (maybe include an option not to run MCMC)

    return result["x"]
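# A minimal usage sketch for fit_ensemble (file names are placeholders, and
# the prior-model values are hypothetical; their exact form is whatever
# PriorWeightsDust accepts):
#
#     beast_data = read_sed_data("proj_seds_trim.grid.hd5", param_list=["Av"])
#     priormodel = {"AV": av_prior, "RV": rv_prior, "fA": fa_prior}
#     best_params = fit_ensemble(
#         beast_data, "proj_0_0_lnp.hd5", priormodel, nstars_expected=50
#     )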
def plot_input_data(megabeast_input_file, chi2_plot=[], log_scale=False):
    """
    Make diagnostic plots of the input data: completeness vs. A_V, and
    histograms of the best-fit A_V in each pixel.

    Parameters
    ----------
    megabeast_input_file : string
        Name of the file that contains settings, filenames, etc

    chi2_plot : list of floats (default=[])
        Make A_V histogram(s) with chi2 less than each of the values in this
        list

    log_scale : boolean (default=False)
        If True, make the histogram x-axis a log scale (to visualize
        log-normal A_V distribution)
    """
    # read in the settings from the file
    mb_settings = read_input(megabeast_input_file)

    # get the project name
    projectname = mb_settings["projectname"]

    # read in the beast data that is needed by all the pixels
    beast_data = {}
    # - SED data
    beast_data.update(
        read_beast_data.read_sed_data(
            mb_settings["beast_seds_filename"],
            param_list=["Av"],  # , "Rv", "f_A"
        )
    )
    # - max completeness
    beast_data.update(
        read_beast_data.read_noise_data(
            mb_settings["beast_noise_filename"],
            param_list=["completeness"],
        )
    )
    beast_data["completeness"] = np.max(beast_data["completeness"], axis=1)

    # read in the nstars image
    nstars_image, nstars_header = fits.getdata(
        mb_settings["nstars_filename"], header=True
    )

    # dimensions of images/plotting
    y_dimen = nstars_image.shape[0]
    x_dimen = nstars_image.shape[1]

    # set up the multi-page figure
    if not log_scale:
        pp = PdfPages("{0}_megabeast/plot_input_data.pdf".format(projectname))
    else:
        pp = PdfPages("{0}_megabeast/plot_input_data_log.pdf".format(projectname))

    # save the best-fit A_V
    best_av = [[[] for j in range(x_dimen)] for i in range(y_dimen)]
    best_av_chi2 = [[[] for j in range(x_dimen)] for i in range(y_dimen)]

    # -----------------
    # Completeness vs A_V
    # -----------------

    print("")
    print("Making completeness/Av plot")
    print("")

    # set up figure
    plt.figure(figsize=(6, 6))
    plt.subplot(1, 1, 1)

    for i in tqdm(range(y_dimen), desc="y pixels"):
        for j in tqdm(range(x_dimen), desc="x pixels"):

            if nstars_image[i, j] > 20:

                # get info about the fits
                lnp_filename = mb_settings[
                    "lnp_file_prefix"
                ] + "_{0}_{1}_lnp.hd5".format(j, i)
                lnp_data = read_beast_data.read_lnp_data(
                    lnp_filename,
                    nstars=nstars_image[i, j],
                    shift_lnp=True,
                )

                # get the completeness and BEAST model parameters for the
                # same grid points as the sparse likelihoods
                lnp_grid_vals = read_beast_data.get_lnp_grid_vals(
                    beast_data, lnp_data
                )

                # grab the things we want to plot
                plot_av = lnp_grid_vals["Av"]
                plot_comp = lnp_grid_vals["completeness"]

                for n in range(nstars_image[i, j]):

                    # plot a random subset of the AVs and completenesses
                    if (i % 3 == 0) and (j % 3 == 0):
                        plot_these = np.random.choice(
                            plot_av[:, n].size, size=20, replace=False
                        )
                        plt.plot(
                            plot_av[plot_these, n]
                            + np.random.normal(scale=0.02, size=plot_these.size),
                            plot_comp[plot_these, n],
                            marker=".",
                            c="black",
                            ms=3,
                            mew=0,
                            linestyle="None",
                            alpha=0.05,
                        )

                    # also save/overplot the values for the best fit
                    max_ind = np.where(
                        lnp_data["vals"][:, n] == np.max(lnp_data["vals"][:, n])
                    )[0][0]
                    best_av[i][j].append(plot_av[max_ind, n])
                    best_av_chi2[i][j].append(-2 * np.max(lnp_data["vals"][:, n]))

                    if (i % 3 == 0) and (j % 3 == 0):
                        plt.plot(
                            plot_av[max_ind, n] + np.random.normal(scale=0.01),
                            plot_comp[max_ind, n],
                            marker=".",
                            c="magenta",
                            ms=2,
                            mew=0,
                            linestyle="None",
                            alpha=0.3,
                            zorder=9999,
                        )

    ax = plt.gca()
    ax.set_xlabel(r"$A_V$")
    ax.set_ylabel("Completeness")

    pp.savefig()

    # -----------------
    # histograms of AVs
    # -----------------

    print("")
    print("Making Av Histograms")
    print("")

    # set up figure
    plt.figure(figsize=(x_dimen * 2, y_dimen * 2))

    # flat list of A_V
    # https://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
    flat_av = [i for sublist in best_av for item in sublist for i in item]

    # define bins
    if not log_scale:
        uniq_av = np.unique(flat_av)
        gap = np.min(np.diff(uniq_av))
        bins = np.arange(uniq_av[0], uniq_av[-1], gap)
    else:
        uniq_av = np.unique(np.log10(flat_av))
        gap = (uniq_av[-1] - uniq_av[0]) / len(uniq_av)
        bins = np.arange(uniq_av[0], uniq_av[-1], gap)

    for i in tqdm(range(y_dimen), desc="y pixels"):
        for j in tqdm(range(x_dimen), desc="x pixels"):

            if nstars_image[i, j] > 20:

                # set up the subplot
                plt.subplot(y_dimen, x_dimen, (y_dimen - i - 1) * x_dimen + j + 1)

                # make a histogram
                if best_av[i][j] != []:
                    if not log_scale:
                        hist_data = best_av[i][j]
                    else:
                        hist_data = np.log10(best_av[i][j])
                    plt.hist(
                        hist_data,
                        bins=bins.size,
                        range=(uniq_av[0] - gap / 2, uniq_av[-1] + gap / 2),
                        facecolor="xkcd:azure",
                        linewidth=0.25,
                        edgecolor="xkcd:azure",
                    )

    plt.suptitle(r"Best-fit $A_V$ for each pixel", fontsize=40)

    pp.savefig()

    # -----------------
    # histograms of AVs with a chi2 cut
    # -----------------

    if len(chi2_plot) > 0:
        print("")
        print("Making Av Histograms with chi^2 cut")
        print("")

        for chi2_cut in chi2_plot:

            # set up figure
            plt.figure(figsize=(x_dimen * 2, y_dimen * 2))

            for i in tqdm(range(y_dimen), desc="y pixels"):
                for j in tqdm(range(x_dimen), desc="x pixels"):

                    if nstars_image[i, j] > 20:

                        # set up the subplot
                        plt.subplot(
                            y_dimen, x_dimen, (y_dimen - i - 1) * x_dimen + j + 1
                        )

                        # make a histogram
                        if best_av[i][j] != []:
                            if not log_scale:
                                plot_av = np.array(best_av[i][j])[
                                    np.array(best_av_chi2[i][j]) < chi2_cut
                                ]
                            else:
                                plot_av = np.log10(
                                    np.array(best_av[i][j])[
                                        np.array(best_av_chi2[i][j]) < chi2_cut
                                    ]
                                )

                            if len(plot_av) != 0:
                                plt.hist(
                                    plot_av,
                                    bins=bins.size,
                                    range=(
                                        uniq_av[0] - gap / 2,
                                        uniq_av[-1] + gap / 2,
                                    ),
                                    facecolor="xkcd:azure",
                                    linewidth=0.25,
                                    edgecolor="xkcd:azure",
                                )

            plt.suptitle(
                r"Best-fit $A_V$ for each pixel, but only using sources with "
                r"$\chi^2 < $" + str(chi2_cut),
                fontsize=40,
            )

            pp.savefig()

    # close PDF figure
    pp.close()
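# A minimal usage sketch for plot_input_data (the input file name is a
# placeholder):
#
#     plot_input_data("megabeast_input.txt", chi2_plot=[5.0, 10.0], log_scale=True)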
def simulate_av_plots(
    megabeast_input_file, log_scale=False, input_lognormal=None, input_lognormal2=None
):
    """
    Plot distributions of simulated AVs, and overplot the best-fit lognormals

    Parameters
    ----------
    megabeast_input_file : string
        Name of the file that contains settings, filenames, etc

    log_scale : boolean (default=False)
        If True, make the histogram x-axis a log scale (to visualize
        log-normal A_V distribution)

    input_lognormal, input_lognormal2 : dict (default=None)
        Set these to the original values used to create the fake data, and
        they will also be plotted
    """
    # read in the settings from the file
    mb_settings = read_input(megabeast_input_file)

    # get the project name
    projectname = mb_settings["projectname"]

    # read in the beast data that is needed by all the pixels
    # *** this likely needs updating - probably will fail - see megabeast.py
    beast_data = read_sed_data(
        mb_settings["beast_seds_filename"],
        mb_settings["beast_noise_filename"],
        beast_params=["completeness", "Av"],  # ,'Rv','f_A'
    )

    av_grid = np.unique(beast_data["Av"])

    # also make a more finely sampled A_V grid
    if not log_scale:
        av_grid_big = np.linspace(np.min(av_grid), np.max(av_grid), 500)
    else:
        av_grid_big = np.geomspace(np.min(av_grid), np.max(av_grid), 500)

    # read in the nstars image
    nstars_image, nstars_header = fits.getdata(
        mb_settings["nstars_filename"], header=True
    )

    # dimensions of images/plotting
    y_dimen = nstars_image.shape[0]
    x_dimen = nstars_image.shape[1]

    # read in the best fits
    label_list = mb_settings["fit_param_names"]
    best_fits = {}
    for label in label_list:
        with fits.open(
            "./"
            + projectname
            + "_megabeast/"
            + projectname
            + "_"
            + label
            + "_bestfit.fits"
        ) as hdu:
            best_fits[label] = hdu[0].data

    # set colors for plots
    cmap = matplotlib.cm.get_cmap("inferno")
    color_data = cmap(0.0)
    color_fit = cmap(0.5)
    if input_lognormal is not None:
        color_input = cmap(0.85)

    # -----------------
    # plotting
    # -----------------

    # set up figure
    fig = plt.figure(figsize=(x_dimen * 2, y_dimen * 2))

    for i in tqdm(range(y_dimen), desc="y pixels"):
        for j in tqdm(range(x_dimen), desc="x pixels"):

            if nstars_image[i, j] > 20:

                # -------- data

                # read in the original lnp data
                lnp_filename = mb_settings["lnp_file_prefix"] + "_%i_%i_lnp.hd5" % (
                    j,
                    i,
                )
                lnp_data = read_lnp_data(lnp_filename, nstars_image[i, j])
                lnp_vals = np.array(lnp_data["vals"])

                # completeness for each of the values
                lnp_comp = beast_data["completeness"][lnp_data["indxs"]]

                # best A_V for each star
                best_av = []
                for k in range(lnp_vals.shape[1]):
                    vals = lnp_vals[:, k]
                    lnp_vals[:, k] = np.log(np.exp(vals) / np.sum(np.exp(vals)))
                    inds = lnp_data["indxs"][:, k]
                    best_val_ind = np.where(vals == np.max(vals))[0][0]
                    best_av.append(beast_data["Av"][inds[best_val_ind]])
                best_av = np.array(best_av)

                # stack up some representation of what's being maximized in
                # ensemble_model.py
                prob_stack = np.sum(lnp_comp * np.exp(lnp_vals), axis=1)

                # normalize it (since it's not clear what the numbers mean
                # anyway)
                prob_stack = prob_stack / np.trapz(prob_stack, av_grid)

                # set up the subplot
                plt.subplot(y_dimen, x_dimen, (y_dimen - i - 1) * x_dimen + j + 1)

                # plot the stacked probabilities
                if not log_scale:
                    plt.plot(
                        av_grid,
                        prob_stack,
                        marker=".",
                        ms=0,
                        mew=0,
                        linestyle="-",
                        color=color_data,
                        linewidth=4,
                    )
                else:
                    plt.plot(
                        np.log10(av_grid),
                        prob_stack,
                        marker=".",
                        ms=0,
                        mew=0,
                        linestyle="-",
                        color=color_data,
                        linewidth=4,
                    )

                ax = plt.gca()
                # -------- input lognormal(s)

                if input_lognormal is not None:

                    # create the lognormal
                    lognorm = _lognorm(
                        av_grid_big,
                        input_lognormal["max_pos"],
                        input_lognormal["sigma"],
                        input_lognormal["N"],
                    )

                    # if there's a second lognormal
                    if input_lognormal2 is not None:
                        lognorm += _lognorm(
                            av_grid_big,
                            input_lognormal2["max_pos"],
                            input_lognormal2["sigma"],
                            input_lognormal2["N"],
                        )

                    # normalize it
                    lognorm = lognorm / np.trapz(lognorm, av_grid_big)

                    # plot it
                    if not log_scale:
                        plt.plot(
                            av_grid_big,
                            lognorm,
                            marker=".",
                            ms=0,
                            mew=0,
                            linestyle="-",
                            color=color_input,
                            linewidth=2,
                            alpha=0.85,
                        )
                    else:
                        plt.plot(
                            np.log10(av_grid_big),
                            lognorm,
                            marker=".",
                            ms=0,
                            mew=0,
                            linestyle="-",
                            color=color_input,
                            linewidth=2,
                            alpha=0.85,
                        )

                # -------- best fit

                # generate the best fit
                lognorm = _two_lognorm(
                    av_grid_big,
                    best_fits["Av1"][i, j],
                    best_fits["Av2"][i, j],
                    sigma1=best_fits["sigma1"][i, j],
                    sigma2=best_fits["sigma2"][i, j],
                    N1=nstars_image[i, j]
                    * (1 - 1 / (best_fits["N12_ratio"][i, j] + 1)),
                    N2=nstars_image[i, j] / (best_fits["N12_ratio"][i, j] + 1),
                )

                # normalize it
                lognorm = lognorm / np.trapz(lognorm, av_grid_big)

                # plot it
                yrange_before = ax.get_ylim()
                if not log_scale:
                    plt.plot(
                        av_grid_big,
                        lognorm,
                        marker=".",
                        ms=0,
                        mew=0,
                        dashes=[3, 1.5],
                        color=color_fit,
                        linewidth=2,
                    )
                else:
                    plt.plot(
                        np.log10(av_grid_big),
                        lognorm,
                        marker=".",
                        ms=0,
                        mew=0,
                        dashes=[3, 1.5],
                        color=color_fit,
                        linewidth=2,
                    )
                ax.set_ylim(yrange_before)

    # add an invisible full-figure subplot to hold the shared axis labels
    fig.add_subplot(111, frameon=False)
    plt.tick_params(
        labelcolor="none", top=False, bottom=False, left=False, right=False
    )
    plt.grid(False)
    if not log_scale:
        plt.xlabel(r"$A_V$", size=15)
    else:
        plt.xlabel(r"Log $A_V$", size=15)
    plt.ylabel("PDF", size=15)
    plt.tight_layout()

    # save the figure
    plt.savefig(
        "./" + projectname + "_megabeast/" + projectname + "_bestfit_plot.pdf"
    )
    plt.close()
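# A minimal usage sketch for simulate_av_plots (the input file name and
# lognormal parameter values are placeholders; the dict keys match what this
# function reads: max_pos, sigma, N):
#
#     simulate_av_plots(
#         "megabeast_input.txt",
#         log_scale=True,
#         input_lognormal={"max_pos": 0.2, "sigma": 0.5, "N": 500.0},
#     )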