Example #1
import numpy as np
import stomp

from the_wizz import core_utils


def load_reference_sample(sample_file_name, stomp_map, args):
    """Method for loading the targert object sample with known redshifts. The
    objects are masked against the requested geomometry and stored with their
    redshifts and into a STOMP.CosmoVector object. The code also returns an
    array of the indices of the reference objects from the columns requested in
    input_flags or simply counts.
    ----------------------------------------------------------------------------
    Args:
        sample_file_name: string name of the file containing the reference,
            known redshift objects. Currently only FITS is supported.
        stomp_map: STOMP.Map object specifying the geometry of the area
            considered.
        args: ArgumentParser.parse_args object returned from input_flags.
    Returns:
        tuple: STOMP::CosmoVector, int array, STOMP::TreeMap
    """
    print("Loading reference sample...")
    sample_data = core_utils.file_checker_loader(sample_file_name)
    reference_vect = stomp.CosmoVector()
    reference_tree_map = stomp.TreeMap(
        stomp_map.RegionResolution(), 200)
    # Sentinel value of -99 marks objects that are cut or fall outside the map.
    reference_idx_array = np.full(sample_data.shape[0], -99, dtype=np.int64)
    for idx, obj in enumerate(sample_data):
        if obj[args.reference_redshift_name] < args.z_min or \
           obj[args.reference_redshift_name] >= args.z_max:
            # Continue if the reference object redshift is out of range.
            continue
        tmp_cang = stomp.CosmoCoordinate(
            np.double(obj[args.reference_ra_name]),
            np.double(obj[args.reference_dec_name]),
            np.double(obj[args.reference_redshift_name]), 1.0,
            stomp.AngularCoordinate.Equatorial)
        if stomp_map.Contains(tmp_cang):
            reference_vect.push_back(tmp_cang)
            reference_tree_map.AddPoint(tmp_cang)
            if args.reference_index_name is None:
                reference_idx_array[idx] = idx
            else:
                reference_idx_array[idx] = obj[args.reference_index_name]
    print("\tLoaded %i / %i reference galaxies..." %
          (reference_vect.size(), sample_data.shape[0]))
    return (reference_vect, reference_idx_array[reference_idx_array > -99],
            reference_tree_map)
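A minimal usage sketch for the loader above. The map file, catalog name, and column names are placeholders, the argparse.Namespace stands in for the object normally produced by input_flags, and the stomp_utils module name is an assumption about where this function lives.

import stomp
from argparse import Namespace

from the_wizz import stomp_utils  # assumed home of load_reference_sample

# Hypothetical geometry and reference catalog; substitute real files/columns.
stomp_map = stomp.Map('field_geometry.map')
args = Namespace(reference_ra_name='RA', reference_dec_name='DEC',
                 reference_redshift_name='Z', reference_index_name=None,
                 z_min=0.01, z_max=1.0)
ref_vect, ref_ids, ref_tree = stomp_utils.load_reference_sample(
    'reference_catalog.fits', stomp_map, args)
print("Kept %i reference objects inside the map." % ref_vect.size())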
Example #2
import numpy as np
import stomp

from the_wizz import core_utils


def load_unknown_sample(sample_file_name, stomp_map, args):
    """ Method for loading a set of objects with unknown redshifts into
    the-wizz. This function maskes the data and returns a STOMP.iTreeMap object
    which is a searchable quad tree where each object stored has a unique
    index. If a name of an index column is not specified a simple counting
    index from thestart of the file is stored.
    ----------------------------------------------------------------------------
    Args:
        sample_file_name: string name specifying the file containing the
            unknown sample. Assumed file type is FITS.
        stomp_map: STOMP.Map object specifying the geometry of the area
            considered.
        args: ArgumentParser.parse_args object returned from input_flags.
    Returns:
        a STOMP::IndexedTreeMap object
    """
    print("Loading unknown sample...")
    # TODO: This is the main bottleneck of the code. Need to make loading,
    #     masking, and creating the quadtree much faster. This may require
    #     creating a python wrapped C++ function for loading and creating
    #     a STOMP iTreeMap.
    sample_data = core_utils.file_checker_loader(sample_file_name)
    unknown_itree_map = stomp.IndexedTreeMap(
        stomp_map.RegionResolution(), 200)
    for idx, obj in enumerate(sample_data):
        tmp_iang = stomp.IndexedAngularCoordinate(
            np.double(obj[args.unknown_ra_name]),
            np.double(obj[args.unknown_dec_name]),
            idx, stomp.AngularCoordinate.Equatorial)
        if args.unknown_index_name is not None:
            tmp_iang.SetIndex(int(obj[args.unknown_index_name]))
        if stomp_map.Contains(tmp_iang):
            unknown_itree_map.AddPoint(tmp_iang)
    print("\tLoaded %i / %i unknown galaxies..." %
          (unknown_itree_map.NPoints(), sample_data.shape[0]))
    return unknown_itree_map
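The unknown-sample loader follows the same pattern; another sketch with placeholder file and column names, reusing the hypothetical map from the previous sketch. With unknown_index_name set to None, the row index is used as the quadtree key.

import stomp
from argparse import Namespace

from the_wizz import stomp_utils  # assumed home of load_unknown_sample

stomp_map = stomp.Map('field_geometry.map')
args = Namespace(unknown_ra_name='RA', unknown_dec_name='DEC',
                 unknown_index_name=None)
unknown_itree = stomp_utils.load_unknown_sample(
    'unknown_catalog.fits', stomp_map, args)
print("Quadtree holds %i unknown objects." % unknown_itree.NPoints())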
Example #3
from the_wizz import core_utils
from the_wizz import pdf_maker_utils
from the_wizz import input_flags

if __name__ == "__main__":
    print("")
    print("The-wiZZ has begun conjuring: running pdf maker...")
    # First we parse the command line for arguments as usual. See
    # input_flags.py for a full list of input arguments.
    args = input_flags.parse_input_pdf_args()
    input_flags.print_args(args)
    # Load the file containing all matched pairs of spectroscopic and
    # photometric objects.
    print("Loading unknown data...")
    unknown_data = core_utils.file_checker_loader(args.unknown_sample_file)
    # Now we figure out what kind of redshift binning we would like to have.
    # This will be one of the largest impacts on the signal to noise of the
    # measurement. Some rules of thumb are:
    #     The narrower the bins are in redshift, the better. You are measuring
    # a correlation: the narrower the bin in comoving distance, the more
    # correlated things will be and thus the larger the amplitude. That is, use
    # the Groth/Peebles scaling to your advantage.
    #     For a spectroscopic sample that is selected for a specific redshift
    # range with few galaxies outside that range (e.g. DEEP2), adaptive binning
    # is recommended. This keeps an equal number of spectra per redshift bin.
    # A good rule is to aim for about 100 spectra per redshift bin for maximum
    # signal to noise.
    #     Linear binning is provided as a courtesy and is not necessarily
    # recommended. It will not give the best signal to noise compared to
    # adaptive and has the same drawback as adaptive in that the bias could
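The adaptive binning recommended above (roughly equal numbers of reference spectra per bin) can be illustrated with a simple quantile cut. This is a generic sketch, not the-wizz's internal binning routine, and it assumes the reference redshifts have already been restricted to [z_min, z_max).

import numpy as np

def adaptive_bin_edges(reference_redshifts, n_per_bin=100):
    # Sort the redshifts and choose the number of bins so that each bin
    # holds about n_per_bin spectra.
    z_sorted = np.sort(np.asarray(reference_redshifts))
    n_bins = max(1, len(z_sorted) // n_per_bin)
    # Quantile cuts give (nearly) equal counts per bin.
    return np.quantile(z_sorted, np.linspace(0.0, 1.0, n_bins + 1))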
Example #4
import numpy as np
from multiprocessing import Pool

from the_wizz import core_utils

# PDFMaker and the _load_pair_data, _compute_region_densities_and_weights,
# and _collapse_multiplex helpers are assumed to be defined elsewhere in
# this module.


def collapse_ids_to_single_estimate(hdf5_data_file_name, scale_name,
                                    unknown_data, args):
    """This is the heart of the-wizz. It enables the matching of a set of
    catalog ids to the ids stored as pairs to the spectroscopic objects. The
    result of this calculation is a intermediary data product containing the
    density of unknown objects around each reference object stored in the
    PDFMaker data structure class.
    --------------------------------------------------------------------------
    Args:
        hdf5_pairs_group: hdf5 group object containing the pair ids for a fixed
            annulus.
        unknown_data: open fits data containing object ids and relivent weights
        args: ArgumentParser.parse_args object returned from
            input_flags.parse_input_pdf_args
    Returns:
        a pdf_maker class object
    """
    # First we load the ids from the input fits data using the column
    # names the user has provided and scale the randoms to the correct amount.
    # Object ids are also sorted in increasing id for later binary search.

    open_hdf5_file = core_utils.file_checker_loader(hdf5_data_file_name)
    hdf5_data_grp = open_hdf5_file['data']

    # Prime our output array.
    n_reference = len(hdf5_data_grp)
    reference_unknown_array = np.empty(n_reference, dtype=np.float32)

    key_array = list(hdf5_data_grp.keys())
    pdf_maker_obj = PDFMaker(key_array, args)

    # Initialize the workers.
    loader_pool = Pool(1)
    # Reserve the remaining processes for matching, with at least one worker.
    matcher_pool = Pool(np.max((args.n_processes - 1, 1)))

    print("\tPre-loading reference data...")
    loader_result = loader_pool.imap(
        _load_pair_data,
        [(args.input_pair_hdf5_file, scale_name,
          key_array[start_idx:start_idx + args.n_reference_load_size])
         for start_idx in range(0, len(key_array),
                                args.n_reference_load_size)])

    print("\tPre-loading unknown data...")
    id_array, rand_ratio, weight_array, ave_weight = \
        _compute_region_densities_and_weights(
            unknown_data, hdf5_data_grp, args)

    # Close the hdf5 file
    open_hdf5_file.close()

    print("\tLoading reference data and starting matching loop...")
    for loader_idx, pair_data in enumerate(loader_result):

        start_idx = loader_idx * args.n_reference_load_size
        end_idx = np.min(((loader_idx + 1) * args.n_reference_load_size,
                          len(key_array)))

        print("\t\tmatching pairs: starting references %i-%i..." %
              (start_idx, end_idx))

        if args.unknown_stomp_region_name is not None:
            matcher_pool_iter = matcher_pool.imap(
                _collapse_multiplex,
                [(data_set,
                  id_array[data_set['region']],
                  weight_array[data_set['region']],
                  args.use_inverse_weighting)
                 for pair_idx, data_set in enumerate(pair_data)],
                chunksize=(int(np.sqrt(len(pair_data)))
                           if args.n_processes > 1 else 1))
        else:
            matcher_pool_iter = matcher_pool.imap(
                _collapse_multiplex,
                [(data_set, id_array, weight_array,
                  args.use_inverse_weighting)
                 for pair_idx, data_set in enumerate(pair_data)],
                chunksize=(int(np.sqrt(len(pair_data)))
                           if args.n_processes > 1 else 1))

        print('\t\tStoring reference data...')
        for ref_idx, ref_data in zip(range(start_idx, end_idx), pair_data):
            pdf_maker_obj.set_reference_obj_data(ref_idx, ref_data)

        print("\t\tComputing/storing pair count...")
        for pair_idx, reference_value in enumerate(matcher_pool_iter):
            reference_unknown_array[start_idx + pair_idx] = reference_value
        # Clean up a bit.
        del pair_data
        del matcher_pool_iter
        print("\t\t\tWaiting for next load...")

    # Close the workers.
    del loader_result
    loader_pool.close()
    matcher_pool.close()
    loader_pool.join()
    matcher_pool.join()
    # Store the results in our PDFMaker class object.
    pdf_maker_obj.initialize_regions_and_densities()
    pdf_maker_obj.set_reference_unknown_array(reference_unknown_array)
    pdf_maker_obj.scale_random_points(rand_ratio, ave_weight)

    return pdf_maker_obj
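A hypothetical call to the routine above, mirroring the loading flow in Example #3 and assuming the function lives in pdf_maker_utils as imported there; 'kpc100t300' is only a placeholder for whatever annulus groups exist in your pair HDF5 file.

from the_wizz import core_utils, input_flags, pdf_maker_utils

args = input_flags.parse_input_pdf_args()
unknown_data = core_utils.file_checker_loader(args.unknown_sample_file)
# 'kpc100t300' is a placeholder scale name; list the groups in the pair
# file to see which scales are available.
pdf_maker_obj = pdf_maker_utils.collapse_ids_to_single_estimate(
    args.input_pair_hdf5_file, 'kpc100t300', unknown_data, args)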
Example #5
 print("The-wiZZ has begun conjuring: running pdf maker full sample...")
 # First we parse the command line for arguments as usual. See
 # input_flags.py for a full list of input arguments.
 args = input_flags.parse_input_pdf_args()
 input_flags.print_args(args)
 if args.unknown_weight_name is not None:
     print("WARNING: A weight name, %s has been specified for the unknown "
           "sample. This is not possible with this code. If this was a "
           "mistake, ignore this message and continue, else kill this "
           "process and use pdf_maker.py instead." %
           args.unknown_weight_name)
     print("\tContinuing...")
 # Load the file containing all matched pairs of spectroscopic and
 # photometric objects.
 print("Loading files...")
 hdf5_pair_file = core_utils.file_checker_loader(args.input_pair_hdf5_file)
 unknown_data = core_utils.file_checker_loader(args.unknown_sample_file)
 # Load the spectroscopic data from the HDF5 data file.
 print("Preloading reference data...")
 pdf_maker = pdf_maker_utils.PDFMaker(
     hdf5_pair_file, args)
 if pdf_maker.reference_redshift_array.max() < args.z_max:
     print("WARNING: requested z_max is greater than available reference "
           "redshifts.")
 # Now we figure out what kind of redshift binning we would like to have.
 # This will be one of the largest impacts on the signal to noise of the
 # measurement. Some rules of thumb are:
 #     The narrower bins are in redshift the better. You are measuring a
 # correlation, the narrower the bin size in comoving distance the more
 # correlated things will be and thus increase the amplitude. Aka use
 # Groth/Pebbles[sic] scaling to your advantage.