def merge_pdf1d_stats(
    subgrid_pdf1d_fnames, subgrid_stats_fnames, re_run=False, output_fname_base=None
):
    """
    Merge a set of 1d pdfs that were generated by fits on different
    grids. It is necessary (and checked) that all the 1d pdfs have the
    same limits, bin values, and number of bins. The stats files are
    also combined; some values for the total grid can be calculated by
    simply comparing them across all the grids, while others are
    recalculated after obtaining the new 1dpdfs.

    Parameters
    ----------
    subgrid_pdf1d_fnames : list of strings
        file names of all the pdf1d fits files

    subgrid_stats_fnames : list of strings
        file names of the stats files. Should be in the same order as
        subgrid_pdf1d_fnames. These files are needed to help with
        averaging the pdf1d files, as they contain the total weight of
        each subgrid.

    re_run : boolean (default=False)
        If True, re-run the merging, even if the merged files already
        exist. If False, only merge files if they don't exist.

    output_fname_base : string (default=None)
        If set, this will prepend the output 1D PDF and stats file names

    Returns
    -------
    merged_pdf1d_fname, merged_stats_fname : string, string
        file names of the resulting pdf1d and stats fits files
        (newly created by this function)
    """

    # -------------
    # before running, check if the files already exist
    # (unless the user wants to re-create them regardless)

    # 1D PDF
    if output_fname_base is None:
        pdf1d_fname = "combined_pdf1d.fits"
    else:
        pdf1d_fname = output_fname_base + "_pdf1d.fits"

    # stats
    if output_fname_base is None:
        stats_fname = "combined_stats.fits"
    else:
        stats_fname = output_fname_base + "_stats.fits"

    if os.path.isfile(pdf1d_fname) and os.path.isfile(stats_fname) and not re_run:
        print(str(len(subgrid_pdf1d_fnames)) + " files already merged, skipping")
        return pdf1d_fname, stats_fname

    # -------------

    nsubgrids = len(subgrid_pdf1d_fnames)
    if len(subgrid_stats_fnames) != nsubgrids:
        raise AssertionError()

    with fits.open(subgrid_pdf1d_fnames[0]) as hdul_0:
        # Get this useful information
        qnames = [hdu.name for hdu in hdul_0[1:]]
        nbins = {q: hdul_0[q].data.shape[1] for q in qnames}
        bincenters = {q: hdul_0[q].data[-1, :] for q in qnames}
        nobs = hdul_0[qnames[0]].data.shape[0] - 1

        # Check the following bin parameters for each of the other
        # subgrids
        for pdf1d_f in subgrid_pdf1d_fnames[1:]:
            with fits.open(pdf1d_f) as hdul:
                for q in qnames:
                    pdf1d_0 = hdul_0[q].data
                    pdf1d = hdul[q].data

                    # the number of bins
                    if pdf1d_0.shape[1] != pdf1d.shape[1]:
                        raise AssertionError()
                    # the number of stars + 1
                    if pdf1d_0.shape[0] != pdf1d.shape[0]:
                        raise AssertionError()
                    # the bin centers (stored in the last row of the
                    # image) should be equal (or both nan)
                    if not (
                        np.isnan(pdf1d_0[-1, 0])
                        and np.isnan(pdf1d[-1, 0])
                        or (pdf1d_0[-1, :] == pdf1d[-1, :]).all()
                    ):
                        raise AssertionError()

    # Load all the stats files
    stats = [Table.read(f) for f in subgrid_stats_fnames]

    # First, let's read the arrays of weights (each subgrid has an array
    # of weights, containing one weight for each source).
    logweight = np.zeros((nobs, nsubgrids))
    for i, s in enumerate(stats):
        logweight[:, i] = s["total_log_norm"]

    # Best grid for each star (take max along grid axis)
    maxweight_index_per_star = np.argmax(logweight, axis=1)

    # Grab the max values, too
    max_logweight = logweight[range(len(logweight)), maxweight_index_per_star]

    # Get linear weights for each object/grid. By casting the maxima
    # into a column shape, the subtraction will be done for each column
    # (broadcasted).
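    # Illustrative note: for one star with log weights [-1000, -1001],
    # np.exp alone would underflow to [0, 0], but subtracting the
    # per-star maximum first gives exp([0, -1]) = [1.0, 0.368],
    # preserving the relative weights exactly (the standard log-sum-exp
    # stabilization).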
    weight = np.exp(logweight - max_logweight[:, np.newaxis])

    # ------------------------------------------------------------------------
    # PDF1D
    # ------------------------------------------------------------------------

    # We will try to reuse the save function defined in fit.py
    save_pdf1d_vals = []
    for i, q in enumerate(qnames):
        # Prepare the output array
        save_pdf1d_vals.append(np.zeros((nobs + 1, nbins[q])))
        # Copy the bin centers
        save_pdf1d_vals[i][-1, :] = bincenters[q]

    # Now, go over all the pdf1d files, and sum the weighted pdf1d values
    for g, pdf1d_f in enumerate(subgrid_pdf1d_fnames):
        with fits.open(pdf1d_f) as hdul:
            for i, q in enumerate(qnames):
                pdf1d_g = hdul[q].data[:-1, :]
                weight_column = weight[:, [g]]  # use [g] to keep dimension
                save_pdf1d_vals[i][:-1, :] += pdf1d_g * weight_column

    # Normalize all the pdfs of the final result
    for i in range(len(save_pdf1d_vals)):
        # sum for each source in a column
        norms_col = np.sum(save_pdf1d_vals[i][:-1, :], axis=1, keepdims=True)
        # non-zero mask as 1d array
        nonzero = norms_col[:, 0] > 0
        save_pdf1d_vals[i][:-1][nonzero, :] /= norms_col[nonzero]

    # Save the combined 1dpdf file
    save_pdf1d(pdf1d_fname, save_pdf1d_vals, qnames)

    # ------------------------------------------------------------------------
    # STATS
    # ------------------------------------------------------------------------

    # Grid with the highest Pmax, for each star
    pmaxes = np.zeros((nobs, nsubgrids))
    for gridnr in range(nsubgrids):
        pmaxes[:, gridnr] = stats[gridnr]["Pmax"]
    max_pmax_index_per_star = pmaxes.argmax(axis=1)

    # Rebuild the stats
    stats_dict = {}
    for col in stats[0].colnames:
        suffix = col.split("_")[-1]

        if suffix == "Best":
            # For the best values, we take the 'Best' value of the grid
            # with the highest Pmax
            stats_dict[col] = [
                stats[gridnr][col][e]
                for e, gridnr in enumerate(max_pmax_index_per_star)
            ]

        elif suffix == "Exp":
            # Weighted average of the expectation values across subgrids
            stats_dict[col] = np.zeros(nobs)
            total_weight_per_star = np.zeros(nobs)
            for gridnr, s in enumerate(stats):
                grid_weight_per_star = weight[:, gridnr]
                stats_dict[col] += stats[gridnr][col] * grid_weight_per_star
                total_weight_per_star += grid_weight_per_star
            stats_dict[col] /= total_weight_per_star

        elif re.compile(r"p\d{1,2}$").match(suffix):
            # Grab the percentile value
            digits = suffix[1:]
            p = int(digits)

            # Find the correct quantity (the col name without the
            # '_'+suffix), and its position in save_pdf1d_vals.
            qname = col[: -len(suffix) - 1]
            qindex = qnames.index(qname)

            # Recalculate the new percentiles from the newly obtained
            # 1dpdf. For each star, call the percentile function.
            stats_dict[col] = np.zeros(nobs)
            for e in range(nobs):
                bins = save_pdf1d_vals[qindex][-1]
                vals = save_pdf1d_vals[qindex][e]
                if vals.max() > 0:
                    stats_dict[col][e] = percentile(bins, [p], vals)[0]
                else:
                    stats_dict[col][e] = 0

        elif col == "chi2min":
            # Take the lowest chi2 over all the grids
            all_chi2s = np.zeros((nobs, nsubgrids))
            for gridnr, s in enumerate(stats):
                all_chi2s[:, gridnr] = s[col]
            stats_dict[col] = np.amin(all_chi2s, axis=1)

        elif col == "Pmax":
            all_pmaxs = np.zeros((nobs, nsubgrids))
            for gridnr, s in enumerate(stats):
                all_pmaxs[:, gridnr] = s[col]
            stats_dict[col] = np.amax(all_pmaxs, axis=1)

        elif col == "total_log_norm":
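            # This equals logsumexp(logweight, axis=1): since
            # weight = exp(logweight - max_logweight[:, np.newaxis]),
            # log(weight.sum(axis=1)) + max_logweight recovers
            # log(sum(exp(logweight))) without over/underflow.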
            stats_dict[col] = np.log(weight.sum(axis=1)) + max_logweight

        # For anything else, just copy the values from grid 0, except
        # for the index fields; those don't make sense when using
        # subgrids. They might in the future, though: the grid split
        # function and some changes to the processing might help with
        # this. Actually, specgrid_indx might make sense, since in my
        # particular case I'm splitting after the spec grid has been
        # created. Still leaving this out, though.
        elif col not in ("chi2min_indx", "Pmax_indx", "specgrid_indx"):
            stats_dict[col] = stats[0][col]

    summary_tab = Table(stats_dict)
    summary_tab.write(stats_fname, overwrite=True)

    print("Saved combined 1dpdfs in " + pdf1d_fname)
    print("Saved combined stats in " + stats_fname)

    return pdf1d_fname, stats_fname
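
# A minimal, self-contained sketch of the merging math above on toy arrays.
# This is illustrative only: `toy_merge_pdf1d` is NOT part of the BEAST API,
# and it assumes numpy is imported as np (as elsewhere in this file). It
# shows the stabilized weighting, the weighted sum over subgrids, and the
# per-star normalization that merge_pdf1d_stats() performs on the real
# FITS data.
def toy_merge_pdf1d():
    rng = np.random.default_rng(0)
    nobs, nbins, nsubgrids = 3, 4, 2

    # one unnormalized 1D PDF per (star, bin) for each subgrid
    pdfs = [rng.random((nobs, nbins)) for _ in range(nsubgrids)]
    # per-star log weights, as stored in each subgrid's 'total_log_norm'
    logweight = rng.normal(size=(nobs, nsubgrids))

    # stabilized linear weights (subtract the per-star maximum first)
    max_logweight = logweight.max(axis=1)
    weight = np.exp(logweight - max_logweight[:, np.newaxis])

    # weighted sum over subgrids, then normalize each star's PDF to unit sum
    merged = sum(pdfs[g] * weight[:, [g]] for g in range(nsubgrids))
    norms = merged.sum(axis=1, keepdims=True)
    nonzero = norms[:, 0] > 0
    merged[nonzero] /= norms[nonzero]
    return merged
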
def Q_all_memory(
    prev_result,
    obs,
    sedgrid,
    obsmodel,
    qnames_in,
    p=[16.0, 50.0, 84.0],
    gridbackend="cache",
    max_nbins=100,
    stats_outname=None,
    pdf1d_outname=None,
    pdf2d_outname=None,
    pdf2d_param_list=None,
    grid_info_dict=None,
    lnp_outname=None,
    lnp_npts=None,
    save_every_npts=None,
    threshold=-40,
    resume=False,
    use_full_cov_matrix=True,
    do_not_normalize=False,
):
    """
    Fit each star, calculate various fit statistics, and output them to
    files. All done in one function for speed and the ability to resume
    partially completed runs.

    Parameters
    ----------
    prev_result : dict
        previous results to include in the output summary table,
        usually basic data on each source
    obs : Observation object instance
        observation catalog
    sedgrid : str or grid.SEDGrid instance
        model grid
    obsmodel : beast noisemodel instance
        noise model data
    qnames_in : list
        names of quantities
    p : array-like
        list of percentile values
    gridbackend : str or grid.GridBackend
        backend to use to load the grid if necessary (memory, cache, hdf)
        (see beast.core.grid)
    max_nbins : int (default=100)
        maximum number of bins to use for the 1D likelihood calculations
    stats_outname : str
        set to output the stats file into a FITS file with extensions
    pdf1d_outname : str
        set to output the 1D PDFs into a FITS file with extensions
    pdf2d_outname : str
        set to output the 2D PDFs into a FITS file with extensions
    pdf2d_param_list : list of strs or None
        set to the parameters for which to make the 2D PDFs
    grid_info_dict : dict
        set to override the mins/maxes of the 1dpdfs, and the number of
        unique values
    lnp_outname : str
        set to output the sparse likelihoods into a (usually HDF5) file
    lnp_npts : int
        set to a number to output a random sampling of the lnp points
        above the threshold; otherwise, the full sparse likelihood is
        output
    save_every_npts : int
        set to save the files below (if set) every n stars;
        a requirement for recovering from partially complete runs
    threshold : float
        value above which to use/save the lnps (defines the sparse
        likelihood)
    resume : bool
        set to designate that this run is resuming a partially complete
        run
    use_full_cov_matrix : bool
        set to use the full covariance matrix if it is present in the
        noise model file
    do_not_normalize : bool
        Do not normalize the prior weights before applying them. This
        should have no effect on the final outcome when using only a
        single grid, but is essential when using the subgridding
        approach.

    Returns
    -------
    N/A
    """
    if isinstance(sedgrid, str):
        g0 = grid.SEDGrid(sedgrid, backend=gridbackend)
    else:
        g0 = sedgrid

    # remove models with zero (or negative) weight
    (g0_indxs,) = np.where(g0["weight"] > 0.0)
    g0_weights = np.log(g0["weight"][g0_indxs])
    if not do_not_normalize:
        # this variable is used on the next line (inside the numexpr
        # string), so it is used regardless of what flake8 says
        g0_weights_sum = np.log(g0["weight"][g0_indxs].sum())  # noqa: F841
        g0_weights = numexpr.evaluate("g0_weights - g0_weights_sum")

    if len(g0["weight"]) != len(g0_indxs):
        print("some zero weight models exist")
        print("orig/g0_indxs", len(g0["weight"]), len(g0_indxs))

    # get the model SEDs
    if hasattr(g0.seds, "read"):
        _seds = g0.seds.read()
    else:
        _seds = g0.seds

    # links to errors and biases
    ast_error = obsmodel["error"]
    ast_bias = obsmodel["bias"]

    # if the ast file includes the full covariance matrices, make links
    full_cov_mat = False
    if (
        use_full_cov_matrix
        and ("q_norm" in obsmodel.keys())
        and ("icov_diag" in obsmodel.keys())
        and ("icov_offdiag" in obsmodel.keys())
    ):
        full_cov_mat = True
        ast_q_norm = obsmodel["q_norm"]
        ast_icov_diag = obsmodel["icov_diag"]
        two_ast_icov_offdiag = 2.0 * obsmodel["icov_offdiag"]
    else:
        ast_ivar = 1.0 / np.asfortranarray(ast_error) ** 2

    if full_cov_mat:
        print("using full covariance matrix")
    else:
        print("not using full covariance matrix")

    # number of observed SEDs to fit
    nobs = len(obs)

    # augment the qnames to include the *full* model SED,
    # i.e., the physical model flux plus the noise model bias term
    qnames = qnames_in
    filters = sedgrid.filters
    for cfilter in filters:
        qnames.append("symlog" + cfilter + "_wd_bias")

    # create the full model fluxes for later use
    # save as a symmetric log, since the fluxes can be negative
    model_seds_with_bias = np.asfortranarray(_seds + ast_bias)
    full_model_flux = (
        np.sign(model_seds_with_bias)
        * np.log1p(np.abs(model_seds_with_bias * math.log(10)))
        / math.log(10)
    )
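    # Illustrative note: the transform above is a "symmetric log"
    # (symlog), sign(x) * log10(1 + |x| * ln(10)). It behaves like
    # log10(|x|) (up to an additive constant) for large |x|, passes
    # smoothly through zero, and is defined for negative fluxes; near
    # zero it is approximately linear, e.g. x = -1e-3 maps to
    # approximately -1e-3.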
    # set up the arrays to temporarily store the results
    n_qnames = len(qnames)
    n_pers = len(p)
    best_vals = np.zeros((nobs, n_qnames))
    exp_vals = np.zeros((nobs, n_qnames))
    per_vals = np.zeros((nobs, n_qnames, n_pers))
    chi2_vals = np.zeros(nobs)
    chi2_indx = np.zeros(nobs)
    lnp_vals = np.zeros(nobs)
    lnp_indx = np.zeros(nobs)
    best_specgrid_indx = np.zeros(nobs)
    total_log_norm = np.zeros(nobs)

    # variable to save the lnp files
    save_lnp_vals = []

    # set up the mapping for the 1D PDFs
    fast_pdf1d_objs = []
    save_pdf1d_vals = []

    # make 1D PDF objects
    for qname in qnames:

        # get bin properties
        qname_vals, nbins, logspacing, minval, maxval = setup_param_bins(
            qname, max_nbins, g0, full_model_flux, filters, grid_info_dict
        )

        # generate the fast 1d pdf mapping
        _tpdf1d = pdf1d(
            qname_vals, nbins, logspacing=logspacing, minval=minval, maxval=maxval
        )
        fast_pdf1d_objs.append(_tpdf1d)

        # set up the arrays to save the 1d PDFs
        save_pdf1d_vals.append(np.zeros((nobs + 1, nbins)))
        save_pdf1d_vals[-1][-1, :] = _tpdf1d.bin_vals

    # if chosen, make 2D PDFs
    if pdf2d_outname is not None:

        # set up the 2D PDFs
        _pdf2d_params = [
            qname
            for qname in qnames
            if qname in pdf2d_param_list and len(np.unique(g0[qname])) > 1
        ]
        _n_params = len(_pdf2d_params)
        pdf2d_qname_pairs = [
            _pdf2d_params[i] + "+" + _pdf2d_params[j]
            for i in range(_n_params)
            for j in range(i + 1, _n_params)
        ]
        fast_pdf2d_objs = []
        save_pdf2d_vals = []

        # make 2D PDF objects
        for qname_pair in pdf2d_qname_pairs:
            qname_1, qname_2 = qname_pair.split("+")

            # get bin properties
            (
                qname_vals_p1,
                nbins_p1,
                logspacing_p1,
                minval_p1,
                maxval_p1,
            ) = setup_param_bins(
                qname_1, max_nbins, g0, full_model_flux, filters, grid_info_dict
            )
            (
                qname_vals_p2,
                nbins_p2,
                logspacing_p2,
                minval_p2,
                maxval_p2,
            ) = setup_param_bins(
                qname_2, max_nbins, g0, full_model_flux, filters, grid_info_dict
            )

            # make the 2D PDF
            _tpdf2d = pdf2d(
                qname_vals_p1,
                qname_vals_p2,
                nbins_p1,
                nbins_p2,
                logspacing_p1=logspacing_p1,
                logspacing_p2=logspacing_p2,
                minval_p1=minval_p1,
                maxval_p1=maxval_p1,
                minval_p2=minval_p2,
                maxval_p2=maxval_p2,
            )
            fast_pdf2d_objs.append(_tpdf2d)

            # arrays for the PDFs and bins
            save_pdf2d_vals.append(np.zeros((nobs + 2, nbins_p1, nbins_p2)))
            save_pdf2d_vals[-1][-2, :, :] = np.tile(
                _tpdf2d.bin_vals_p1, (nbins_p2, 1)
            ).T
            save_pdf2d_vals[-1][-1, :, :] = np.tile(_tpdf2d.bin_vals_p2, (nbins_p1, 1))

    # if this is a resume job, read in the already computed stats and
    # fill the variables
    # also - find the start position for the resumed run
    if resume:
        stats_table = Table.read(stats_outname)

        for k, qname in enumerate(qnames):
            best_vals[:, k] = stats_table["{0:s}_Best".format(qname)]
            exp_vals[:, k] = stats_table["{0:s}_Exp".format(qname)]
            for i, pval in enumerate(p):
                per_vals[:, k, i] = stats_table[
                    "{0:s}_p{1:d}".format(qname, int(pval))
                ]

        chi2_vals = stats_table["chi2min"]
        chi2_indx = stats_table["chi2min_indx"]
        lnp_vals = stats_table["Pmax"]
        lnp_indx = stats_table["Pmax_indx"]
        best_specgrid_indx = stats_table["specgrid_indx"]

        (indxs,) = np.where(stats_table["Pmax"] != 0.0)
        start_pos = max(indxs) + 1
        print(
            "resuming run with start indx = "
            + str(start_pos)
            + " out of "
            + str(len(stats_table["Pmax"]))
        )

        # read in the already computed 1D PDFs
        if pdf1d_outname is not None:
            print("restoring the already computed 1D PDFs from " + pdf1d_outname)
            with fits.open(pdf1d_outname) as hdulist:
                for k in range(len(qnames)):
                    save_pdf1d_vals[k] = hdulist[k + 1].data

        # read in the already computed 2D PDFs
        if pdf2d_outname is not None:
            print("restoring the already computed 2D PDFs from " + pdf2d_outname)
            with fits.open(pdf2d_outname) as hdulist:
                for k in range(len(pdf2d_qname_pairs)):
                    save_pdf2d_vals[k] = hdulist[k + 1].data
    else:
        start_pos = 0

        # set up a new lnp file
        if lnp_outname is not None:
            outfile = tables.open_file(lnp_outname, "w")
            # save the wavelengths in root; remember
            # n_stars = root._v_nchildren - 1
            outfile.create_array(outfile.root, "grid_waves", g0.lamb[:])
            filters = obs.getFilters()
            outfile.create_array(outfile.root, "obs_filters", filters[:])
            outfile.close()

    # loop over the objects and get all the requested quantities
    g0_specgrid_indx = g0["specgrid_indx"]
    _p = np.asarray(p, dtype=float)

    it = tqdm(
        islice(obs.enumobs(), int(start_pos), None),
        total=len(obs) - start_pos,
        desc="Calculating Lnp/Stats",
    )
    for e, obj in it:
        # calculate the full nD posterior
        sed = obj

        cur_mask = sed == 0
        # need an alternate way to generate the mask as zeros can be
        # valid values in the observed SED (KDG 29 Jan 2016)
        # currently, set the mask to False always
        cur_mask[:] = False

        if full_cov_mat:
            (lnp, chi2) = N_covar_logLikelihood(
                sed,
                model_seds_with_bias,
                ast_q_norm,
                ast_icov_diag,
                two_ast_icov_offdiag,
                lnp_threshold=abs(threshold),
            )
        else:
            (lnp, chi2) = N_logLikelihood_NM(
                sed,
                model_seds_with_bias,
                ast_ivar,
                mask=cur_mask,
                lnp_threshold=abs(threshold),
            )

        lnp = lnp[g0_indxs]
        chi2 = chi2[g0_indxs]
        # lnp = numexpr.evaluate('lnp + g0_weights')
        lnp += g0_weights  # multiply by the prior weights (sum in log space)

        (indx,) = np.where((lnp - max(lnp[np.isfinite(lnp)])) > threshold)

        # now generate the sparse likelihood (remove later if this works
        # by updating the code below)
        #   checked if changing to the full likelihood speeds things up
        #   - the answer is no
        #   and is likely related to the switch here to the sparse
        #   likelihood for the weight calculation
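        # Note (illustrative): the weights computed below are the
        # posterior probabilities of the retained models up to a common
        # factor: weights[i] = exp(lnps[i] - log_norm), with
        # log_norm = lnps.max() used for numerical stability, then
        # renormalized so that sum(weights) == 1.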
        lnps = lnp[indx]
        chi2s = chi2[indx]

        # log_norm = np.log(getNorm_lnP(lnps))
        # if not np.isfinite(log_norm):
        #     log_norm = lnps.max()
        log_norm = lnps.max()
        weights = np.exp(lnps - log_norm)

        # normalize the weights to make sure they sum to one
        # (needed for np.random.choice)
        weight_sum = np.sum(weights)
        weights /= weight_sum

        # save the current set of lnps
        if lnp_outname is not None:
            if lnp_npts is not None and lnp_npts < len(indx):
                rindx = np.random.choice(indx, size=lnp_npts, replace=False)
            else:
                rindx = indx
            save_lnp_vals.append(
                [
                    e,
                    np.array(g0_indxs[rindx], dtype=np.int64),
                    np.array(lnp[rindx], dtype=np.float32),
                    np.array(chi2[rindx], dtype=np.float32),
                    np.array([sed]).T,
                ]
            )

        # To merge the stats for different subgrids, we need the total
        # weight of a grid, which is sum(exp(lnps)). Since
        # sum(exp(lnps - log_norm - log(weight_sum))) = 1, the relative
        # weight of each subgrid will be exp(log_norm + log(weight_sum)).
        # Therefore, we also store the following quantity:
        total_log_norm[e] = log_norm + np.log(weight_sum)

        # index to the full model grid for the best fit values
        best_full_indx = g0_indxs[indx[weights.argmax()]]

        # index to the spectral grid
        best_specgrid_indx[e] = g0_specgrid_indx[best_full_indx]

        # goodness of fit quantities
        chi2_vals[e] = chi2s.min()
        chi2_indx[e] = g0_indxs[indx[chi2s.argmin()]]
        lnp_vals[e] = lnps.max()
        lnp_indx[e] = best_full_indx

        # calculate quantities for individual parameters:
        # best value, expectation value, 1D PDF, percentiles
        for k, qname in enumerate(qnames):
            if "_bias" in qname:
                fname = (qname.replace("_wd_bias", "")).replace("symlog", "")
                q = full_model_flux[:, filters.index(fname)]
            else:
                q = g0[qname]

            # best value
            best_vals[e, k] = q[best_full_indx]

            # expectation value
            exp_vals[e, k] = expectation(q[g0_indxs[indx]], weights=weights)

            # percentile values
            pdf1d_bins, pdf1d_vals = fast_pdf1d_objs[k].gen1d(g0_indxs[indx], weights)
            save_pdf1d_vals[k][e, :] = pdf1d_vals
            if pdf1d_vals.max() > 0:
                # normalization removed to allow for post-processing with
                # different distance runs (needed for the SMIDGE-SMC)
                # pdf1d_vals /= pdf1d_vals.max()
                per_vals[e, k, :] = percentile(pdf1d_bins, _p, weights=pdf1d_vals)
            else:
                # zero for all requested percentiles, however many there are
                per_vals[e, k, :] = 0.0

        # calculate 2D PDFs for the subset of parameter pairs
        if pdf2d_outname is not None:
            for k in range(len(pdf2d_qname_pairs)):
                save_pdf2d_vals[k][e, :, :] = fast_pdf2d_objs[k].gen2d(
                    g0_indxs[indx], weights
                )

        # incremental save (useful if the job dies early, to recover most
        # of the computations)
        if save_every_npts is not None:
            if (e > 0) and (e % save_every_npts == 0):
                # save the 1D PDFs
                if pdf1d_outname is not None:
                    save_pdf1d(pdf1d_outname, save_pdf1d_vals, qnames)

                # save the 2D PDFs
                if pdf2d_outname is not None:
                    save_pdf2d(pdf2d_outname, save_pdf2d_vals, pdf2d_qname_pairs)

                # save the stats/catalog
                if stats_outname is not None:
                    save_stats(
                        stats_outname,
                        prev_result,
                        best_vals,
                        exp_vals,
                        per_vals,
                        chi2_vals,
                        chi2_indx,
                        lnp_vals,
                        lnp_indx,
                        best_specgrid_indx,
                        total_log_norm,
                        qnames,
                        p,
                    )

                # save the lnps
                if lnp_outname is not None:
                    save_lnp(lnp_outname, save_lnp_vals)
                    save_lnp_vals = []

    # do the final save of everything (or the last set for the lnp values)

    # save the 1D PDFs
    if pdf1d_outname is not None:
        save_pdf1d(pdf1d_outname, save_pdf1d_vals, qnames)

    # save the 2D PDFs
    if pdf2d_outname is not None:
        save_pdf2d(pdf2d_outname, save_pdf2d_vals, pdf2d_qname_pairs)

    # save the stats/catalog
    if stats_outname is not None:
        save_stats(
            stats_outname,
            prev_result,
            best_vals,
            exp_vals,
            per_vals,
            chi2_vals,
            chi2_indx,
            lnp_vals,
            lnp_indx,
            best_specgrid_indx,
            total_log_norm,
            qnames,
            p,
        )

    # save the lnps
    if lnp_outname is not None:
        save_lnp(lnp_outname, save_lnp_vals)
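
# A minimal sketch of recovering percentiles from a binned 1D PDF, in the
# spirit of the `percentile(pdf1d_bins, _p, weights=pdf1d_vals)` calls above.
# This toy helper (`toy_pdf_percentiles`) is NOT part of the BEAST API and
# assumes numpy is imported as np (as elsewhere in this file); the real
# `percentile` helper used by this module may differ in detail.
def toy_pdf_percentiles(bins, pdf_vals, percentiles=(16.0, 50.0, 84.0)):
    # build the cumulative distribution over the bins and normalize it
    # to end at 1 (assumes pdf_vals has a positive total)
    cdf = np.cumsum(pdf_vals, dtype=float)
    cdf /= cdf[-1]
    # invert the CDF by linear interpolation at the requested fractions
    return np.interp(np.asarray(percentiles) / 100.0, cdf, bins)

# Example: toy_pdf_percentiles(np.arange(4.0), np.array([1.0, 2.0, 2.0, 1.0]))
# returns the bin positions below which roughly 16, 50, and 84 per cent of
# the probability mass lies.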