def merge_files(use_sd=True, nsubs=1):
    """
    Merge all of the results from the assorted fitting sub-files (divided by
    source density, subgrids, or both).

    Parameters
    ----------
    use_sd : boolean (default=True)
        set to True if the fitting runs were split by source density

    nsubs : int (default=1)
        number of subgrids used for the physics model

    Returns
    -------
    None
        Nothing is returned; the merge helpers are handed output base names
        built from ``datamodel.project``.
    """

    # if there's no SD and no subgridding, running this is unnecessary
    # (the condition must be "not use_sd": with SD splitting and a single
    # grid, the per-SD stats files still have to be merged below)
    if (not use_sd) and (nsubs == 1):
        print("No merging necessary")
        return

    # before doing ANYTHING, force datamodel to re-import (otherwise, any
    # changes within this python session will not be loaded!)
    importlib.reload(datamodel)
    # check input parameters
    verify_params.verify_input_format(datamodel)

    # get file name lists (to check if they exist and/or need to be resumed)
    file_dict = create_filenames.create_filenames(use_sd=use_sd, nsubs=nsubs)

    # - output files
    stats_files = file_dict["stats_files"]
    pdf_files = file_dict["pdf_files"]

    # - other useful info
    # each entry is indexed as (x[0], x[1]) below to build the SD/sub tags
    sd_sub_info = file_dict["sd_sub_info"]

    # the unique SD+sub combinations, keeping first-occurrence order
    unique_sd_sub = [
        x for i, x in enumerate(sd_sub_info) if i == sd_sub_info.index(x)
    ]

    # --------------------
    # no subgrids
    # --------------------

    if nsubs == 1:

        out_filebase = "{0}/{0}".format(datamodel.project)
        reorder_tags = [
            "sd{0}_sub{1}".format(x[0], x[1]) for x in unique_sd_sub
        ]
        merge_beast_stats.merge_stats_files(
            stats_files, out_filebase, reorder_tag_list=reorder_tags
        )

    # --------------------
    # use subgrids
    # --------------------

    if nsubs > 1:

        # runs were split by source density
        if use_sd:

            # list to save the merged stats file names
            merged_stats_files = []

            for sd_sub in unique_sd_sub:

                # indices with the current sd_sub
                ind = [j for j, x in enumerate(sd_sub_info) if x == sd_sub]

                # merge the subgrid files for that SD+sub
                out_filebase = "{0}/SD{1}_sub{2}/{0}_SD{1}_sub{2}".format(
                    datamodel.project, sd_sub[0], sd_sub[1]
                )
                (
                    merged_pdf1d_fname,
                    merged_stats_fname,
                ) = subgridding_tools.merge_pdf1d_stats(
                    [pdf_files[j] for j in ind],
                    [stats_files[j] for j in ind],
                    re_run=False,
                    output_fname_base=out_filebase,
                )
                merged_stats_files.append(merged_stats_fname)

            # merge the merged stats files
            out_filebase = "{0}/{0}".format(datamodel.project)
            reorder_tags = [
                "sd{0}_sub{1}".format(x[0], x[1]) for x in unique_sd_sub
            ]
            merge_beast_stats.merge_stats_files(
                merged_stats_files, out_filebase, reorder_tag_list=reorder_tags
            )

        # runs weren't split by source density
        else:

            out_filebase = "{0}/{0}".format(datamodel.project)
            subgridding_tools.merge_pdf1d_stats(
                pdf_files, stats_files, output_fname_base=out_filebase
            )
if args.merge:
    # Derive the expected per-subgrid output names from the SED grid names:
    # swap the extension to .fits, then swap the "seds" tag for the product.
    modelsedgridfiles = get_modelsubgridfiles()
    with_fits = [s.replace(".hd5", ".fits") for s in modelsedgridfiles]
    pdf1dfiles = [s.replace("seds", "pdf1d") for s in with_fits]
    statsfiles = [s.replace("seds", "stats") for s in with_fits]
    output_fname_base = os.path.join(settings.project, "combined")

    if args.dens_bin is not None:
        # results of a density-binned run live under the bin's subfolder
        pdf1dfiles, statsfiles = [
            [os.path.join(bin_subfolder, f) for f in file_list]
            for file_list in [pdf1dfiles, statsfiles]
        ]
        output_fname_base = os.path.join(bin_subfolder, output_fname_base)

    if args.ignore_missing_subresults:
        # Drop subgrids with missing results, and hope for the best.
        # The two lists are handed to the merge in parallel (and printed
        # zipped below), so filter them as pairs: keeping a pdf1d file whose
        # stats partner is missing (or vice versa) would leave the lists
        # with different lengths or shifted against each other.
        existing_pairs = [
            (pdf1d, stats)
            for pdf1d, stats in zip(pdf1dfiles, statsfiles)
            if os.path.isfile(pdf1d) and os.path.isfile(stats)
        ]
        pdf1dfiles = [pdf1d for pdf1d, _ in existing_pairs]
        statsfiles = [stats for _, stats in existing_pairs]

    print("Merging")
    print(list(zip(pdf1dfiles, statsfiles)))
    subgridding_tools.merge_pdf1d_stats(
        pdf1dfiles, statsfiles, output_fname_base=output_fname_base
    )

# print help if no arguments
if not any(vars(args).values()):
    parser.print_help()
def merge_files(beast_settings_info, use_sd=True, nsubs=1, partial=False):
    """
    Combine the outputs of fitting runs that were divided up by source
    density bin, by physics-model subgrid, or by both.

    Set partial=True to inspect the stars that are already finished while a
    fit is still running.  This only matters when subgrids are in use.

    Parameters
    ----------
    beast_settings_info : string or beast.tools.beast_settings.beast_settings instance
        if string: file name with beast settings
        if class: beast.tools.beast_settings.beast_settings instance

    use_sd : boolean (default=True)
        True when the fitting was done in source density bins

    nsubs : int (default=1)
        how many subgrids the physics model was divided into

    partial : boolean (default=False)
        If True, only stars that have been run in every subgrid end up in
        the merged output; stars fit in only some of the subgrids are
        dropped from the "partial" files.  Applies to the 1D PDF and stats
        files only; lnP files are not merged in this mode.

    Raises
    ------
    TypeError
        if beast_settings_info is neither a string nor a beast_settings
        instance
    """

    # a single grid with no density bins leaves nothing to combine
    if not use_sd and nsubs == 1:
        print("No merging necessary")
        return

    # accept either a settings file name or a ready-made settings object
    if isinstance(beast_settings_info, str):
        settings = beast_settings.beast_settings(beast_settings_info)
    else:
        if not isinstance(beast_settings_info, beast_settings.beast_settings):
            raise TypeError(
                "beast_settings_info must be string or beast.tools.beast_settings.beast_settings instance"
            )
        settings = beast_settings_info

    # names of every per-run output file, plus the bin/sub label of each run
    names = create_filenames.create_filenames(
        settings, use_sd=use_sd, nsubs=nsubs
    )
    stats_files = names["stats_files"]
    pdf_files = names["pdf_files"]
    lnp_files = names["lnp_files"]
    sd_sub_info = names["sd_sub_info"]

    # distinct bin/sub labels, in order of first appearance
    distinct_pairs = []
    for pair in sd_sub_info:
        if pair not in distinct_pairs:
            distinct_pairs.append(pair)

    def _tag(pair):
        # label used to reorder the entries of the final stats file
        return "bin{0}_sub{1}".format(pair[0], pair[1])

    if nsubs == 1:
        # ----- single grid: only the stats files need to be combined -----
        merge_beast_stats.merge_stats_files(
            stats_files,
            "{0}/{0}".format(settings.project),
            reorder_tag_list=[_tag(pair) for pair in distinct_pairs],
        )

    elif nsubs > 1 and use_sd:
        # ----- subgrids within density bins -----
        per_bin_stats = []

        for pair in distinct_pairs:
            # positions of the runs that belong to this bin/sub
            members = [
                k for k, entry in enumerate(sd_sub_info) if entry == pair
            ]

            base = "{0}/bin{1}_sub{2}/{0}_bin{1}_sub{2}".format(
                settings.project, pair[0], pair[1]
            )
            if partial:
                base += "_partial"

            # 1D PDFs and stats across the subgrids of this bin/sub
            _, stats_name = subgridding_tools.merge_pdf1d_stats(
                [pdf_files[k] for k in members],
                [stats_files[k] for k in members],
                re_run=False,
                output_fname_base=base,
                partial=partial,
            )
            per_bin_stats.append(stats_name)

            # lnP files are only merged for complete runs
            if not partial:
                subgridding_tools.merge_lnp(
                    [lnp_files[k] for k in members],
                    re_run=False,
                    output_fname_base=base,
                    threshold=-10,
                )

        # finally combine the per-bin stats files into one
        merge_beast_stats.merge_stats_files(
            per_bin_stats,
            "{0}/{0}".format(settings.project),
            reorder_tag_list=[_tag(pair) for pair in distinct_pairs],
        )

    elif nsubs > 1:
        # ----- subgrids only, no density bins -----
        base = "{0}/{0}".format(settings.project)

        # 1D PDFs and stats
        subgridding_tools.merge_pdf1d_stats(
            pdf_files,
            stats_files,
            output_fname_base=base,
            partial=partial,
        )

        # lnP files are only merged for complete runs
        if not partial:
            subgridding_tools.merge_lnp(
                lnp_files,
                re_run=False,
                output_fname_base=base,
                threshold=-10,
            )
def test_merge_pdf1d_stats(self):
    """
    Using cached versions of the observations, sed grid, and noise model,
    split the grids and do the fitting on the subgrids and original
    grid. Merge the results from the subgrids and compare to the results
    from fitting the full grid.
    """
    ######################################
    # STEP 1: GET SOME DATA TO WORK WITH #
    ######################################

    # read in the observed data
    obsdata = Observations(
        self.obs_fname_cache, self.settings.filters, self.settings.obs_colnames
    )

    #########################################################################################
    # STEP 2: SPLIT THE GRIDS AND GENERATE THE GRID INFO DICT AS IN THE SUBGRIDDING EXAMPLE #
    #########################################################################################
    num_subgrids = 3

    # Split SED grid
    sub_seds_trim_fnames = subgridding_tools.split_grid(
        self.seds_trim_fname_cache, num_subgrids, overwrite=True
    )

    # Split noise grid (a standardized function does not exist)
    sub_noise_trim_fnames = []

    noisemodel_vals = noisemodel.get_noisemodelcat(self.noise_trim_fname_cache)
    slices = subgridding_tools.uniform_slices(
        len(noisemodel_vals["bias"]), num_subgrids
    )
    for i, slc in enumerate(slices):
        # each sub noise file is written next to the cached noise model
        outname = self.noise_trim_fname_cache.replace(
            ".hd5", "sub{}.hd5".format(i)
        )
        with tables.open_file(outname, "w") as outfile:
            outfile.create_array(
                outfile.root, "bias", noisemodel_vals["bias"][slc]
            )
            outfile.create_array(
                outfile.root, "error", noisemodel_vals["error"][slc]
            )
            outfile.create_array(
                outfile.root, "completeness", noisemodel_vals["completeness"][slc]
            )
        sub_noise_trim_fnames.append(outname)

    # Collect information about the parameter ranges, to make the pdf1d bins
    # consistent between subgrids
    grid_info_dict = subgridding_tools.reduce_grid_info(
        sub_seds_trim_fnames, sub_noise_trim_fnames, nprocs=1, cap_unique=100
    )

    ##################################################
    # STEP 3: GENERATE FILENAMES AND RUN THE FITTING #
    ##################################################
    def make_gridsub_fnames(base_fname, num_subgrids, extension=".fits"):
        # insert "gridsub<i>" in front of the extension, one name per subgrid
        return [
            base_fname.replace(extension, "gridsub{}{}".format(i, extension))
            for i in range(num_subgrids)
        ]

    # NamedTemporaryFile is only used here as a source of unique file names
    stats_fname = tempfile.NamedTemporaryFile(suffix=".fits").name
    pdf1d_fname = tempfile.NamedTemporaryFile(suffix=".fits").name
    lnp_fname = tempfile.NamedTemporaryFile(suffix=".hd5").name

    subgrid_pdf1d_fnames = make_gridsub_fnames(pdf1d_fname, num_subgrids)
    subgrid_stats_fnames = make_gridsub_fnames(stats_fname, num_subgrids)
    subgrid_lnp_fnames = make_gridsub_fnames(
        lnp_fname, num_subgrids, extension=".hd5"
    )

    for i in range(num_subgrids):
        sub_noisemodel_vals = noisemodel.get_noisemodelcat(
            sub_noise_trim_fnames[i]
        )
        fit.summary_table_memory(
            obsdata,
            sub_noisemodel_vals,
            sub_seds_trim_fnames[i],
            threshold=-40.0,
            save_every_npts=100,
            lnp_npts=500,
            stats_outname=subgrid_stats_fnames[i],
            pdf1d_outname=subgrid_pdf1d_fnames[i],
            lnp_outname=subgrid_lnp_fnames[i],
            grid_info_dict=grid_info_dict,
            do_not_normalize=True,
        )
        # The do_not_normalize option is absolutely crucial!

    # Now merge the results
    merged_pdf1d_fname, merged_stats_fname = subgridding_tools.merge_pdf1d_stats(
        subgrid_pdf1d_fnames, subgrid_stats_fnames
    )

    # Do a full fit also
    normal_stats = tempfile.NamedTemporaryFile(suffix=".fits").name
    normal_pdf1d = tempfile.NamedTemporaryFile(suffix=".fits").name
    normal_lnp = tempfile.NamedTemporaryFile(suffix=".hd5").name
    fit.summary_table_memory(
        obsdata,
        noisemodel_vals,
        self.seds_trim_fname_cache,
        threshold=-40.0,
        save_every_npts=100,
        lnp_npts=500,
        stats_outname=normal_stats,
        pdf1d_outname=normal_pdf1d,
        lnp_outname=normal_lnp,
        do_not_normalize=True,
    )
    # Here, we also need to use do_not_normalize, otherwise Pmax will be
    # different by a factor

    # CHECKS
    tolerance = 1e-6

    # Open both pdf1d files in a context manager so the file handles are
    # closed even when one of the comparisons below fails.
    with fits.open(normal_pdf1d) as fits_normal, fits.open(
        merged_pdf1d_fname
    ) as fits_new:

        if not len(fits_new) == len(fits_normal):
            raise AssertionError(
                "merged and full-grid pdf1d files differ in number of extensions"
            )

        # Floating point differences are expected here as well, hence the
        # tolerance-based comparison.  Extension 0 is skipped; the remaining
        # ones are matched by name rather than by position.
        for k in range(1, len(fits_new)):
            qname = fits_new[k].header["EXTNAME"]
            np.testing.assert_allclose(
                fits_new[k].data,
                fits_normal[qname].data,
                rtol=tolerance,
                atol=tolerance,
            )

    table_normal = Table.read(normal_stats)
    table_new = Table.read(merged_stats_fname)

    if not len(table_normal) == len(table_new):
        raise AssertionError(
            "merged and full-grid stats tables differ in number of rows"
        )

    # These will normally fail, as the merging process can not be made
    # bit-correct due to floating point math (exacerbated by exponentials)
    for c in table_new.colnames:
        if c == "Name" or c == "RA" or c == "DEC":
            # identification columns must match exactly
            np.testing.assert_equal(
                table_normal[c],
                table_new[c],
                err_msg="column {} is not equal".format(c),
            )
        else:
            np.testing.assert_allclose(
                table_normal[c],
                table_new[c],
                rtol=tolerance,
                equal_nan=True,
                err_msg="column {} is not close enough".format(c),
            )
def test_merge_pdf1d_stats():
    """
    Fit the observations on three subgrids and on the full grid, merge the
    subgrid results, and check that the merged 1D PDFs and stats agree with
    the full-grid results to within a small tolerance.

    Input files are obtained through ``download_rename``.
    """
    ######################################
    # STEP 1: GET SOME DATA TO WORK WITH #
    ######################################
    vega_fname = download_rename("vega.hd5")
    obs_fname = download_rename("b15_4band_det_27_A.fits")
    noise_trim_fname = download_rename(
        "beast_example_phat_noisemodel_trim.grid.hd5"
    )
    seds_trim_fname = download_rename("beast_example_phat_seds_trim.grid.hd5")

    # download cached version of fitting results
    # stats_fname_cache = download_rename('beast_example_phat_stats.fits')
    # pdf1d_fname_cache = download_rename('beast_example_phat_pdf1d.fits')

    # read in the observed data
    filters = [
        "HST_WFC3_F275W",
        "HST_WFC3_F336W",
        "HST_ACS_WFC_F475W",
        "HST_ACS_WFC_F814W",
        "HST_WFC3_F110W",
        "HST_WFC3_F160W",
    ]
    basefilters = ["F275W", "F336W", "F475W", "F814W", "F110W", "F160W"]
    obs_colnames = [f.lower() + "_rate" for f in basefilters]

    obsdata = Observations(
        obs_fname, filters, obs_colnames, vega_fname=vega_fname
    )

    #########################################################################################
    # STEP 2: SPLIT THE GRIDS AND GENERATE THE GRID INFO DICT AS IN THE SUBGRIDDING EXAMPLE #
    #########################################################################################
    num_subgrids = 3

    # Split SED grid
    sub_seds_trim_fnames = subgridding_tools.split_grid(
        seds_trim_fname, num_subgrids, overwrite=True
    )

    # Split noise grid (a standardized function does not exist)
    sub_noise_trim_fnames = []

    noisemodel_vals = get_noisemodelcat(noise_trim_fname)
    slices = subgridding_tools.uniform_slices(
        len(noisemodel_vals["bias"]), num_subgrids
    )
    for i, slc in enumerate(slices):
        # each sub noise file is written next to the downloaded noise model
        outname = noise_trim_fname.replace(".hd5", "sub{}.hd5".format(i))
        with tables.open_file(outname, "w") as outfile:
            outfile.create_array(
                outfile.root, "bias", noisemodel_vals["bias"][slc]
            )
            outfile.create_array(
                outfile.root, "error", noisemodel_vals["error"][slc]
            )
            outfile.create_array(
                outfile.root, "completeness", noisemodel_vals["completeness"][slc]
            )
        sub_noise_trim_fnames.append(outname)

    # Collect information about the parameter ranges, to make the pdf1d bins
    # consistent between subgrids
    grid_info_dict = subgridding_tools.reduce_grid_info(
        sub_seds_trim_fnames, sub_noise_trim_fnames, nprocs=1, cap_unique=100
    )

    ##################################################
    # STEP 3: GENERATE FILENAMES AND RUN THE FITTING #
    ##################################################
    def make_gridsub_fnames(base_fname, num_subgrids, extension=".fits"):
        # insert "gridsub<i>" in front of the extension, one name per subgrid
        return [
            base_fname.replace(extension, "gridsub{}{}".format(i, extension))
            for i in range(num_subgrids)
        ]

    # NOTE(review): these output paths are hard-coded (POSIX-only /tmp, and
    # the full-fit outputs further down land in the working directory);
    # consider moving them to a per-test temporary directory.
    stats_fname = "/tmp/beast_example_phat_stats.fits"
    pdf1d_fname = "/tmp/beast_example_phat_pdf1d.fits"
    lnp_fname = "/tmp/beast_example_phat_lnp.hd5"

    subgrid_pdf1d_fnames = make_gridsub_fnames(pdf1d_fname, num_subgrids)
    subgrid_stats_fnames = make_gridsub_fnames(stats_fname, num_subgrids)
    subgrid_lnp_fnames = make_gridsub_fnames(
        lnp_fname, num_subgrids, extension=".hd5"
    )

    for i in range(num_subgrids):
        sub_noisemodel_vals = get_noisemodelcat(sub_noise_trim_fnames[i])
        fit.summary_table_memory(
            obsdata,
            sub_noisemodel_vals,
            sub_seds_trim_fnames[i],
            threshold=-40.0,
            save_every_npts=100,
            lnp_npts=60,
            stats_outname=subgrid_stats_fnames[i],
            pdf1d_outname=subgrid_pdf1d_fnames[i],
            lnp_outname=subgrid_lnp_fnames[i],
            grid_info_dict=grid_info_dict,
            do_not_normalize=True,
        )
        # The do_not_normalize option is absolutely crucial!

    # Now merge the results
    merged_pdf1d_fname, merged_stats_fname = subgridding_tools.merge_pdf1d_stats(
        subgrid_pdf1d_fnames, subgrid_stats_fnames
    )

    # Do a full fit also
    normal_stats = "normal_stats.fits"
    normal_pdf1d = "normal_pdf1d.fits"
    normal_lnp = "normal_lnp.hd5"
    fit.summary_table_memory(
        obsdata,
        noisemodel_vals,
        seds_trim_fname,
        threshold=-40.0,
        save_every_npts=100,
        lnp_npts=60,
        stats_outname=normal_stats,
        pdf1d_outname=normal_pdf1d,
        lnp_outname=normal_lnp,
        do_not_normalize=True,
    )
    # Here, we also need to use do_not_normalize, otherwise Pmax will be
    # different by a factor

    # CHECKS
    tolerance = 1e-6

    print("comparing pdf1d")
    # fits_cache = fits.open(pdf1d_fname_cache)

    # Open both pdf1d files in a context manager so the file handles are
    # closed even when one of the comparisons below fails.
    with fits.open(normal_pdf1d) as fits_normal, fits.open(
        merged_pdf1d_fname
    ) as fits_new:

        if not len(fits_new) == len(fits_normal):
            raise AssertionError(
                "merged and full-grid pdf1d files differ in number of extensions"
            )

        # Floating point differences are expected here as well, hence the
        # tolerance-based comparison.  Extension 0 is skipped; the remaining
        # ones are matched by name rather than by position.
        for k in range(1, len(fits_new)):
            qname = fits_new[k].header["EXTNAME"]
            print(qname)
            np.testing.assert_allclose(
                fits_new[k].data,
                fits_normal[qname].data,
                rtol=tolerance,
                atol=tolerance,
            )

    print("comparing stats")
    # table_cache = Table.read(stats_fname_cache)
    table_normal = Table.read(normal_stats)
    table_new = Table.read(merged_stats_fname)

    if not len(table_normal) == len(table_new):
        raise AssertionError(
            "merged and full-grid stats tables differ in number of rows"
        )

    # These will normally fail, as the merging process can not be made
    # bit-correct due to floating point math (exacerbated by exponentials)
    for c in table_new.colnames:
        print(c)
        if c == "Name" or c == "RA" or c == "DEC":
            # identification columns must match exactly
            np.testing.assert_equal(
                table_normal[c],
                table_new[c],
                err_msg="column {} is not equal".format(c),
            )
        else:
            np.testing.assert_allclose(
                table_normal[c],
                table_new[c],
                rtol=tolerance,
                equal_nan=True,
                err_msg="column {} is not close enough".format(c),
            )
trimmed_noisemodelfile) fit.summary_table_memory(obsdata, noisemodel_vals, modelsedgrid, resume=args.resume, threshold=-10., save_every_npts=100, lnp_npts=60, stats_outname=statsfile, pdf1d_outname=pdf1dfile, grid_info_dict=grid_info_dict, lnp_outname=lnpfile, do_not_normalize=True) print('Done fitting on grid ' + trimmed_modelsedgridfile) parallel_wrapper(fit_submodel, modelsedgridfiles) new_time = time.clock() print('time to fit: ', (new_time - start_time) / 60., ' min') if args.merge: modelsedgridfiles = get_modelsubgridfiles() with_fits = [s.replace('.hd5', '.fits') for s in modelsedgridfiles] pdf1dfiles = [s.replace('seds', 'pdf1d') for s in with_fits] statsfiles = [s.replace('seds', 'stats') for s in with_fits] print('Merging') print(list(zip(pdf1dfiles, statsfiles))) subgridding_tools.merge_pdf1d_stats(pdf1dfiles, statsfiles) # print help if no arguments if not any(vars(args).values()): parser.print_help()