import numpy as np
import stomp

from the_wizz import core_utils


def load_reference_sample(sample_file_name, stomp_map, args):
    """Method for loading the target object sample with known redshifts. The
    objects are masked against the requested geometry and stored with their
    redshifts into a STOMP.CosmoVector object. The code also returns an array
    of the indices of the reference objects from the column requested in
    input_flags or simply a counting index.
    ----------------------------------------------------------------------------
    Args:
        sample_file_name: string name of the file containing the reference,
            known redshift objects. Currently only FITS is supported.
        stomp_map: STOMP.Map object specifying the geometry of the area
            considered.
        args: ArgumentParser.parse_args object returned from input_flags.
    Returns:
        tuple: STOMP::CosmoVector, int array, STOMP::TreeMap
    """
    print("Loading reference sample...")
    sample_data = core_utils.file_checker_loader(sample_file_name)
    reference_vect = stomp.CosmoVector()
    reference_tree_map = stomp.TreeMap(
        stomp_map.RegionResolution(), 200)
    # A sentinel of -99 marks objects that are masked out or outside the
    # redshift range; a signed integer dtype is required for the sentinel.
    reference_idx_array = np.ones(sample_data.shape[0], dtype=np.int_) * -99
    for idx, obj in enumerate(sample_data):
        if obj[args.reference_redshift_name] < args.z_min or \
           obj[args.reference_redshift_name] >= args.z_max:
            # Skip reference objects whose redshift is out of range.
            continue
        tmp_cang = stomp.CosmoCoordinate(
            np.double(obj[args.reference_ra_name]),
            np.double(obj[args.reference_dec_name]),
            np.double(obj[args.reference_redshift_name]), 1.0,
            stomp.AngularCoordinate.Equatorial)
        if stomp_map.Contains(tmp_cang):
            reference_vect.push_back(tmp_cang)
            reference_tree_map.AddPoint(tmp_cang)
            if args.reference_index_name is None:
                reference_idx_array[idx] = idx
            else:
                reference_idx_array[idx] = obj[args.reference_index_name]
    print("\tLoaded %i / %i reference galaxies..." %
          (reference_vect.size(), sample_data.shape[0]))
    return (reference_vect, reference_idx_array[reference_idx_array > -99],
            reference_tree_map)
def load_unknown_sample(sample_file_name, stomp_map, args):
    """Method for loading a set of objects with unknown redshifts into
    the-wizz. This function masks the data and returns a STOMP.IndexedTreeMap
    object, a searchable quad tree in which each stored object has a unique
    index. If the name of an index column is not specified, a simple counting
    index from the start of the file is stored.
    ----------------------------------------------------------------------------
    Args:
        sample_file_name: string name specifying the file containing the
            unknown sample. Assumed file type is FITS.
        stomp_map: STOMP.Map object specifying the geometry of the area
            considered.
        args: ArgumentParser.parse_args object returned from input_flags.
    Returns:
        a STOMP::IndexedTreeMap object
    """
    print("Loading unknown sample...")
    # TODO: This is the main bottleneck of the code. Need to make loading,
    #     masking, and creating the quadtree much faster. This may require
    #     creating a python-wrapped C++ function for loading and creating
    #     a STOMP iTreeMap.
    sample_data = core_utils.file_checker_loader(sample_file_name)
    unknown_itree_map = stomp.IndexedTreeMap(
        stomp_map.RegionResolution(), 200)
    for idx, obj in enumerate(sample_data):
        tmp_iang = stomp.IndexedAngularCoordinate(
            np.double(obj[args.unknown_ra_name]),
            np.double(obj[args.unknown_dec_name]),
            idx, stomp.AngularCoordinate.Equatorial)
        if args.unknown_index_name is not None:
            tmp_iang.SetIndex(int(obj[args.unknown_index_name]))
        if stomp_map.Contains(tmp_iang):
            unknown_itree_map.AddPoint(tmp_iang)
    print("\tLoaded %i / %i unknown galaxies..." %
          (unknown_itree_map.NPoints(), sample_data.shape[0]))
    return unknown_itree_map
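# A minimal usage sketch for the two loaders above, assuming they live in a
# module importable as the_wizz.stomp_utils, that the pair-maker flag parser
# is named input_flags.parse_input_pair_args, and that flags named stomp_map,
# n_regions, reference_sample_file, and unknown_sample_file exist. The
# stomp.Map(file) constructor and InitializeRegions call are likewise assumed
# from the STOMP python bindings; none of these names appear in this excerpt.
import stomp

from the_wizz import input_flags
from the_wizz import stomp_utils

args = input_flags.parse_input_pair_args()
# Load the survey geometry and split it into regions; objects falling outside
# this mask are dropped by the loaders.
stomp_map = stomp.Map(args.stomp_map)
stomp_map.InitializeRegions(args.n_regions)
reference_vect, reference_ids, reference_tree = \
    stomp_utils.load_reference_sample(
        args.reference_sample_file, stomp_map, args)
unknown_itree = stomp_utils.load_unknown_sample(
    args.unknown_sample_file, stomp_map, args)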
from the_wizz import core_utils
from the_wizz import pdf_maker_utils
from the_wizz import input_flags


if __name__ == "__main__":
    print("")
    print("The-wiZZ has begun conjuring: running pdf maker...")
    # First we parse the command line for arguments as usual. See
    # input_flags.py for a full list of input arguments.
    args = input_flags.parse_input_pdf_args()
    input_flags.print_args(args)
    # Load the file containing all matched pairs of spectroscopic and
    # photometric objects.
    print("Loading unknown data...")
    unknown_data = core_utils.file_checker_loader(args.unknown_sample_file)
    # Now we figure out what kind of redshift binning we would like to have.
    # This will be one of the largest impacts on the signal to noise of the
    # measurement. Some rules of thumb are:
    #     The narrower the bins are in redshift, the better. You are measuring
    #     a correlation: the narrower the bin size in comoving distance, the
    #     more correlated things will be and thus the larger the amplitude.
    #     That is, use Groth/Peebles scaling to your advantage.
    #     For a spectroscopic sample that is selected for a specific redshift
    #     range with few galaxies outside that range (e.g. DEEP2), adaptive
    #     binning is recommended. This keeps an equal number of spectra per
    #     redshift bin. A good rule is to aim for about 100 spectra per
    #     redshift bin for maximum signal to noise.
    #     Linear binning is provided as a courtesy and is not necessarily
    #     recommended. It will not give the best signal to noise compared to
    #     adaptive and has the same drawback as adaptive in that the bias
    #     could
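# The comments above recommend adaptive binning with roughly equal numbers of
# spectra (about 100) per redshift bin. Below is a minimal sketch of how such
# bin edges could be computed from the reference redshifts using numpy
# percentiles; it is illustrative only and not necessarily the exact algorithm
# implemented in pdf_maker_utils.
import numpy as np


def equal_count_bin_edges(redshifts, z_min, z_max, n_per_bin=100):
    """Redshift bin edges holding roughly equal numbers of reference objects."""
    z_use = np.sort(redshifts[(redshifts >= z_min) & (redshifts < z_max)])
    n_bins = max(1, z_use.size // n_per_bin)
    # Percentile-based edges put ~n_per_bin spectra between successive edges.
    return np.percentile(z_use, np.linspace(0.0, 100.0, n_bins + 1))

# For example, 1000 usable spectra between z_min and z_max yield 10 bins of
# roughly 100 spectra each.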
from multiprocessing import Pool

import numpy as np

from the_wizz import core_utils

# Note: PDFMaker and the _load_pair_data, _compute_region_densities_and_weights,
# and _collapse_multiplex helpers used below are defined elsewhere in this
# module and are not shown in this excerpt.


def collapse_ids_to_single_estimate(hdf5_data_file_name, scale_name,
                                    unknown_data, args):
    """This is the heart of the-wizz. It enables the matching of a set of
    catalog ids to the ids stored as pairs to the spectroscopic objects. The
    result of this calculation is an intermediary data product containing the
    density of unknown objects around each reference object, stored in the
    PDFMaker data structure class.
    --------------------------------------------------------------------------
    Args:
        hdf5_data_file_name: string name of the HDF5 file containing the
            stored pair ids for a fixed annulus.
        scale_name: string name of the annulus scale group to load.
        unknown_data: open fits data containing object ids and relevant
            weights.
        args: ArgumentParser.parse_args object returned from
            input_flags.parse_input_pdf_args
    Returns:
        a PDFMaker class object
    """
    # First we load the ids from the input fits data using the column names
    # the user has provided and scale the randoms to the correct amount.
    # Object ids are also sorted in increasing id for a later binary search.
    open_hdf5_file = core_utils.file_checker_loader(hdf5_data_file_name)
    hdf5_data_grp = open_hdf5_file['data']
    # Prime our output array.
    n_reference = len(hdf5_data_grp)
    reference_unknown_array = np.empty(n_reference, dtype=np.float32)
    key_array = list(hdf5_data_grp.keys())
    pdf_maker_obj = PDFMaker(key_array, args)
    # Initialize the workers: one process pre-loads the pair data while the
    # remaining processes (at least one) perform the matching.
    loader_pool = Pool(1)
    matcher_pool = Pool(max(args.n_processes - 1, 1))
    print("\tPre-loading reference data...")
    loader_result = loader_pool.imap(
        _load_pair_data,
        [(args.input_pair_hdf5_file, scale_name,
          key_array[start_idx:start_idx + args.n_reference_load_size])
         for start_idx in range(0, len(key_array),
                                args.n_reference_load_size)])
    print("\tPre-loading unknown data...")
    id_array, rand_ratio, weight_array, ave_weight = \
        _compute_region_densities_and_weights(
            unknown_data, hdf5_data_grp, args)
    # Close the hdf5 file.
    open_hdf5_file.close()
    print("\tLoading reference data and starting matching loop...")
    for loader_idx, pair_data in enumerate(loader_result):
        start_idx = loader_idx * args.n_reference_load_size
        end_idx = min((loader_idx + 1) * args.n_reference_load_size,
                      len(key_array))
        print("\t\tmatching pairs: starting references %i-%i..." %
              (start_idx, end_idx))
        if args.unknown_stomp_region_name is not None:
            matcher_pool_iter = matcher_pool.imap(
                _collapse_multiplex,
                [(data_set, id_array[data_set['region']],
                  weight_array[data_set['region']],
                  args.use_inverse_weighting)
                 for pair_idx, data_set in enumerate(pair_data)],
                chunksize=int(np.where(args.n_processes > 1,
                                       np.sqrt(len(pair_data)), 1)))
        else:
            matcher_pool_iter = matcher_pool.imap(
                _collapse_multiplex,
                [(data_set, id_array, weight_array,
                  args.use_inverse_weighting)
                 for pair_idx, data_set in enumerate(pair_data)],
                chunksize=int(np.where(args.n_processes > 1,
                                       np.sqrt(len(pair_data)), 1)))
        print('\t\tStoring reference data...')
        for ref_idx, ref_data in zip(range(start_idx, end_idx), pair_data):
            pdf_maker_obj.sef_reference_obj_data(ref_idx, ref_data)
        print("\t\tComputing/storing pair count...")
        for pair_idx, reference_value in enumerate(matcher_pool_iter):
            reference_unknown_array[start_idx + pair_idx] = reference_value
        # Clean up a bit.
        del pair_data
        del matcher_pool_iter
        print("\t\t\tWaiting for next load...")
    # Close the workers.
    del loader_result
    loader_pool.close()
    matcher_pool.close()
    loader_pool.join()
    matcher_pool.join()
    # Store the results in our PDFMaker class object.
    pdf_maker_obj.initialize_regions_and_densities()
    pdf_maker_obj.set_reference_unknown_array(reference_unknown_array)
    pdf_maker_obj.scale_random_points(rand_ratio, ave_weight)

    return pdf_maker_obj
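# The comment at the top of collapse_ids_to_single_estimate notes that the
# unknown-object ids are sorted so the stored pair ids can be matched by
# binary search. The helper below is an illustrative stand-in for what a
# worker such as _collapse_multiplex does conceptually; the real helper's
# signature and weighting details are not shown in this excerpt.
import numpy as np


def matched_density_sketch(pair_ids, catalog_ids_sorted, catalog_weights,
                           use_inverse_weighting=False, pair_inv_dists=None):
    """Sum the weights of unknown objects whose ids appear in one reference
    object's stored pair list, using np.searchsorted as the binary search."""
    idx = np.searchsorted(catalog_ids_sorted, pair_ids)
    idx = np.clip(idx, 0, catalog_ids_sorted.size - 1)
    matched = catalog_ids_sorted[idx] == pair_ids
    weights = catalog_weights[idx[matched]]
    if use_inverse_weighting and pair_inv_dists is not None:
        # Optionally weight each matched pair by its stored inverse distance,
        # mirroring the use_inverse_weighting flag above.
        weights = weights * pair_inv_dists[matched]
    return weights.sum()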
print("The-wiZZ has begun conjuring: running pdf maker full sample...") # First we parse the command line for arguments as usual. See # input_flags.py for a full list of input arguments. args = input_flags.parse_input_pdf_args() input_flags.print_args(args) if args.unknown_weight_name is not None: print("WARNING: A weight name, %s has been specified for the unknown " "sample. This is not possible with this code. If this was a " "mistake, ignore this message and continue, else kill this " "process and use pdf_maker.py instead." % args.unknown_weight_name) print("\tContinuing...") # Load the file containing all matched pairs of spectroscopic and # photometric objects. print("Loading files...") hdf5_pair_file = core_utils.file_checker_loader(args.input_pair_hdf5_file) unknown_data = core_utils.file_checker_loader(args.unknown_sample_file) # Load the spectroscopic data from the HDF5 data file. print("Preloading reference data...") pdf_maker = pdf_maker_utils.PDFMaker( hdf5_pair_file, args) if pdf_maker.reference_redshift_array.max() < args.z_max: print("WARNING: requested z_max is greater than available reference " "redshifts.") # Now we figure out what kind of redshift binning we would like to have. # This will be one of the largest impacts on the signal to noise of the # measurement. Some rules of thumb are: # The narrower bins are in redshift the better. You are measuring a # correlation, the narrower the bin size in comoving distance the more # correlated things will be and thus increase the amplitude. Aka use # Groth/Pebbles[sic] scaling to your advantage.