Example #1
def create_test_input_data(fname_in=default_fname_in,
                           haloID_field="ID",
                           NHalos_test=10000):
    """
    Creates a test data set from the user supplied input data.

    Copies over a specified number of halos (default 10,000) to perform the
    testing on.

    Note: We copy entire snapshots over, meaning that halo counts will not be
    exact.  If the first snapshot with halos has 6,000 halos and the second
    has 7,000, our testing file will contain 13,000 halos.

    If the user asks to test on more halos than the data file contains, we
    raise a ``RuntimeError``.

    Parameters
    ----------

    fname_in: String. Default: `<test_directory>/test_data.hdf5`.
        Path to the HDF5 trees we're creating the data set from.

    haloID_field: String. Default: 'ID'.
        Field name within the HDF5 file that corresponds to the unique halo ID.

    NHalos_test: Integer. Default: 10000.
        The number of halos that will be copied into the test data set.

    Returns
    ----------

    default_user_fname_in: String.
        The path to the small copied data file.
    """

    with h5py.File(fname_in, "r") as f_in, \
         h5py.File(default_user_fname_in, "w") as f_out:
        NHalos = 0

        Snap_Keys, Snap_Nums = cmn.get_snapkeys_and_nums(f_in.keys())

        for snap_key in Snap_Keys:
            if len(f_in[snap_key][haloID_field]) == 0:
                continue

            cmn.copy_group(f_in, f_out, snap_key)
            NHalos += len(f_in[snap_key][haloID_field])

            if NHalos >= NHalos_test:
                break

    if NHalos < NHalos_test:
        print("Your supplied data file did not contain enough halos to test.")
        print("Your file contained {0} halos whereas you specified to run "
              "on {1} halos.".format(NHalos, NHalos_test))
        print("Either lower the number of halos to test on (--Nhalos_test) or "
              "use other data.")
        raise RuntimeError

    return default_user_fname_in
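
A minimal usage sketch (the input path below is hypothetical; `default_user_fname_in` is a module-level constant from the listing above):

# Build a small test file containing roughly 10,000 halos.
test_fname = create_test_input_data(fname_in="trees/full_trees.hdf5",
                                    haloID_field="ID",
                                    NHalos_test=10000)
print("Test data written to {0}".format(test_fname))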
Example #2
def forest_sorter(
    fname_in,
    fname_out,
    haloID_field="ID",
    sort_fields=["ForestID", "hostHaloID", "Mass_200mean"],
    sort_direction=[1, 1, -1],
    ID_fields=["Head", "Tail", "RootHead", "RootTail", "ID", "hostHaloID"],
    index_mult_factor=int(1e12)):
    """
    Sorts and saves a HDF5 tree file on the specified sort fields.  The IDs of
    the halos are assumed to be generated from their index within the data
    file and hence will be updated to reflect the sorted order.

    Parameters
    ----------

    fname_in, fname_out : String
        Path to the input HDF5 trees and path to where the sorted trees will be
        saved.

    haloID_field : String, optional
        Field name within the HDF5 file that corresponds to the unique halo ID.

    sort_fields : List of strings, optional
        The HDF5 field names that the sorting will be performed on. The entries
        are ordered such that the first field will be the outer-most sort and
        the last field will be the inner-most sort.

    sort_direction : List of integers, optional
        Specifies the direction in which the sorting will occur for each
        ``sort_field`` entry. 1 corresponds to ascending, -1 to descending.

    ID_fields : List of strings, optional
        The HDF5 field names that correspond to properties that use halo IDs.
        As the halo IDs are updated to reflect the new sort order, these fields
        must also be updated.

    index_mult_factor: Integer, optional
        Multiplication factor to generate a temporally unique halo ID.

    Returns
    ----------

    None.

    Notes
    ----------

    The default parameters are chosen to match the ASTRO3D Genesis trees as
    produced by VELOCIraptor + Treefrog.
    """

    print("")
    print("=================================")
    print("Running Forest Sorter")
    print("Input Unsorted Trees: {0}".format(fname_in))
    print("Output Sorted Trees: {0}".format(fname_out))
    print("Sort Fields: {0}".format(sort_fields))
    print("Sort Direction: {0}".format(sort_direction))
    print("Index Mult Factor: {0}".format(index_mult_factor))
    print("=================================")
    print("")

    with h5py.File(fname_in, "r") as f_in, \
         h5py.File(fname_out, "w") as f_out:

        Snap_Keys, Snap_Nums = cmn.get_snapkeys_and_nums(f_in.keys())

        ID_maps = dict()
        snapshot_indices = dict()

        print("")
        print("Generating the dictionary to map the oldIDs to the newIDs.")

        start_time = time.time()

        for snap_key in tqdm(Snap_Keys):
            # We only want to go through snapshots that contain halos.
            if len(f_in[snap_key][haloID_field]) == 0:
                continue

            # Need to get the indices that sort the data according to the
            # specified keys.
            indices = get_sort_indices(f_in, snap_key, sort_fields,
                                       sort_direction)

            old_haloIDs = f_in[snap_key][haloID_field][:]
            old_haloIDs_sorted = old_haloIDs[indices]

            # The ID of a halo depends on its snapshot-local index.
            # As the new haloIDs will be sorted correctly, their indices will
            # simply be np.arange(len(indices)).
            new_haloIDs = cmn.index_to_temporalID(np.arange(len(indices)),
                                                  Snap_Nums[snap_key],
                                                  index_mult_factor)

            oldIDs_to_newIDs = dict(zip(old_haloIDs_sorted, new_haloIDs))

            # Now we've generated the Dicts for this snap, put them into the
            # global dictionary.  We key the ID Dict by the snapshot number
            # rather than the field name because we can access the snapshot
            # number using the oldID.

            snapshot_indices[snap_key] = indices
            ID_maps[Snap_Nums[snap_key]] = oldIDs_to_newIDs

        # For some ID fields (e.g., NextProgenitor), the value is -1.
        # When we convert from the temporalID to a snapshot number, we subtract
        # 1 and divide by the multiplication factor (default 1e12), then cast
        # to an integer. Hence -2 divided by a huge number will be less than 1
        # and when it's cast to an integer will result in 0.
        # So the 'Snapshot Number' for values of -1 will be 0.  We want to
        # preserve these -1 flags so we map -1 to -1.
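        # A worked example of that arithmetic, assuming the ID convention
        # implied above (ID = snapnum * index_mult_factor + index + 1):
        #     snapnum = int((ID - 1) / index_mult_factor)
        #     ID = 20000000000006 -> int(20000000000005 / 1e12) = 20
        #     ID = -1             -> int(-2 / 1e12) = int(-2e-12) = 0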

        # However we also need to preserve the dictionary for `Snap_000`...

        try:
            oldID_maps_zero_keys = list(ID_maps[0].keys())
            oldID_maps_zero_values = list(ID_maps[0].values())
        except KeyError:
            ID_maps[0] = {-1: -1}
        else:
            ID_maps[0] = dict(
                zip(oldID_maps_zero_keys + [-1],
                    oldID_maps_zero_values + [-1]))

        end_time = time.time()
        print("Creation of dictionary map took {0:3f} seconds".format(
            end_time - start_time))
        print("")

        # At this point we have the dictionaries that map the oldIDs to the
        # newIDs in addition to the indices that control the sorting of the
        # forests.  We now loop through all the fields within each halo within
        # each snapshot and if the field contains a haloID we update it.
        # While going through each field, we will then write out the data into
        # a new HDF5 file in the order specified by indices.

        print("")
        print("Now writing out the snapshots in the sorted order.")
        start_time = time.time()

        # Don't use name `snap_key` because there could be other fields such as
        # 'header'.
        for key in tqdm(f_in.keys()):
            cmn.copy_group(f_in, f_out, key)

            if key in Snap_Keys:
                try:
                    oldIDs = list(ID_maps[Snap_Nums[key]].keys())
                except KeyError:
                    pass
                else:
                    dataset_name = "{0}/oldIDs".format(key)
                    f_out.create_dataset(dataset_name, data=oldIDs)

            for field in f_in[key]:

                # Some keys (e.g., 'Header') aren't snapshot groups, so catch
                # the KeyError they raise.
                try:
                    NHalos = len(f_in[key][haloID_field])
                    if (NHalos == 0):
                        continue
                except KeyError:
                    continue

                if field in ID_fields:  # If this field has an ID...
                    # Need to get the oldIDs, find the snapshot they correspond
                    # to and then get the newIDs using our dictionary.
                    oldID = f_in[key][field][:]
                    snapnum = cmn.temporalID_to_snapnum(
                        oldID, index_mult_factor)

                    # Keep the IDs integral (np.empty defaults to float64).
                    newID = np.empty(len(oldID), dtype=oldID.dtype)

                    for count, (snap, ID) in enumerate(zip(snapnum, oldID)):
                        try:
                            newID[count] = ID_maps[snap][ID]
                        except KeyError:
                            print(
                                "Encountered a KeyError when mapping the oldID "
                                "to the newID.")
                            print("Field {0}\tSnapnum {1}\tOldID {2}\t"
                                  "ID_maps[snap] {3}".format(
                                      field, snap, ID, ID_maps[snap]))
                            raise

                    to_write = np.array(newID)  # Remember what we need to write.
                else:
                    to_write = f_in[key][field][:]

                # We know what we need to write, so let's write it in the
                # correct order.
                f_out[key][field][:] = to_write[snapshot_indices[key]]

        end_time = time.time()
        print("Writing of snapshots took {0:3f} seconds".format(end_time -
                                                                start_time))
        print("Done!")
        print("")
Example #3
def my_test_sorted_properties(
        fname_in=default_fname_in,
        fname_out=default_fname_out,
        ID_fields="ID",
        sort_fields=["ForestID", "hostHaloID", "Mass_200mean"],
        sort_direction=[1, 1, -1],
        gen_data=0):
    """
    Ensures that the halo properties were sorted and saved properly.

    Note: The ID fields are not checked here because they are changed by
    design.  If HaloID 1900000000001 had a descendant pointer (i.e., a 'Head'
    pointer in Treefrog) of 2100000000003, this may no longer hold because the
    ID of Halo 2100000000003 may itself have changed.

    Parameters
    ----------

    fname_in, fname_out: String. Default: `<test_directory>/test_data.hdf5`,
    `<test_directory>/test_sorted.hdf5`.
        Path to the input HDF5 trees and path to where the sorted trees were
        saved.

    ID_fields: List of strings or String. Default: 'ID'.
        The field name(s) within the HDF5 file that correspond to halo IDs.
        These fields are skipped by the check (see the note above).

    sort_fields: List of strings. Default: ['ForestID', 'hostHaloID',
                                            'Mass_200mean'].
        The HDF5 field names that the sorting was performed on. The entries
        are ordered such that the first field will be the outer-most sort and
        the last field will be the inner-most sort.

    sort_direction : List of integers, optional
        Specifies the direction in which the sorting will occur for each
        ``sort_field`` entry. 1 corresponds to ascending, -1 to descending.

    gen_data: Integer. Default: 0.
        Flag whether this function was called using sorted trees generated for
        testing purposes.  If this flag is set to 1 then the sorted trees will
        be removed if the test fails.

    Returns
    ----------

    None. ``pytest.fail()`` is invoked if the test fails.
    """

    with h5py.File(fname_in, "r") as f_in, \
         h5py.File(fname_out, "r") as f_out:

        Snap_Keys, Snap_Nums = cmn.get_snapkeys_and_nums(f_out.keys())

        print("Looping over each Snapshot")
        for snap_key in tqdm(Snap_Keys):  # Now let's check each field.
            for field in f_out[snap_key]:
                if field in ID_fields:  # Ignore ID fields.
                    continue

                if field == "oldIDs":  # Original input doesn't have oldIDs.
                    continue

                indices = fs.get_sort_indices(f_in, snap_key, sort_fields,
                                              sort_direction)

                input_data = f_in[snap_key][field][:]
                input_data_sorted = input_data[indices]
                output_data = f_out[snap_key][field][:]

                if not np.array_equal(output_data, input_data_sorted):
                    print("For snapshot number {0}, there was a mistmach for "
                          "field {1} between the sorted input data and the "
                          "data stored in the output file.".format(
                              Snap_Nums[snap_key], field))
                    print("The raw input data is {0}.  The supposed indices "
                          "that would sort this data is {1} corresponding to "
                          "'sorted' input data of {2}.  However the data "
                          "stored in the output file is {3}".format(
                              input_data, indices, input_data_sorted,
                              output_data))

                    if gen_data:
                        cleanup(fname_in)
                    pytest.fail()
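
A minimal sketch of invoking the check by hand (pytest would normally collect it; the paths are hypothetical):

# Compare the saved output against a fresh in-memory sort of the input.
my_test_sorted_properties(fname_in="test_data.hdf5",
                          fname_out="test_sorted.hdf5",
                          gen_data=0)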
Example #4
def my_test_sorted_order(
        fname_out=default_fname_out,
        haloID_field="ID",
        sort_fields=["ForestID", "hostHaloID", "Mass_200mean"],
        sort_direction=[1, 1, -1],
        gen_data=0):
    """
    Checks the indices of the output file to ensure sorting order is correct.

    Calls `recursively_check_sort` for each halo, which iterates through the
    sort fields to ensure the ordering is correct.

    Parameters
    ----------

    fname_out: String. Default: `<test_directory>/test_sorted.hdf5`.
        Path to the sorted HDF5 trees we're testing.

        .. note::
            If `gen_data=1` the sorted trees will be removed upon exit.

    haloID_field: String. Default: 'ID'.
        Field name within the HDF5 file that corresponds to the unique halo ID.

    sort_fields: List of strings. Default: ['ForestID', 'hostHaloID',
                                            'Mass_200mean'].
        The HDF5 field names that the sorting was performed on. The entries
        are ordered such that the first field will be the outer-most sort and
        the last field will be the inner-most sort.

    sort_direction : List of integers, optional
        Specifies the direction in which the sorting will occur for each
        ``sort_field`` entry. 1 corresponds to ascending, -1 to descending.

    gen_data: Integer. Default: 0.
        Flag whether this function was called using sorted trees generated for
        testing purposes.  If this flag is set to 1 then the sorted trees will
        be removed if the test fails.

    Returns
    ----------

    None.

    `~pytest.fail()` is invoked by `recursively_check_sort()` if the test fails.
    """
    def recursively_check_sort(snapshot_data, sort_fields, sort_direction,
                               sort_level, halo_idx, gen_data):
        """
        Moves through the sort levels, checking that each key is correctly
        sorted.
        """

        # Our checking goes from the outer-most to the inner-most field.  If
        # the user didn't sort on all fields and passed None, we stop
        # recursing.
        key = sort_fields[sort_level]
        if key is None or "NONE" in key.upper():
            return

        values = snapshot_data[key][:]

        this_value = values[halo_idx]
        next_value = values[halo_idx + 1]

        # If the values are equal, we need to move to the next sort level.  However
        # if we're currently at the inner-most level then the sorting is still done
        # correctly (equal values next to each other).
        if this_value == next_value \
           and sort_level < (len(sort_fields) - 1):
            recursively_check_sort(snapshot_data, sort_fields, sort_direction,
                                   sort_level + 1, halo_idx, gen_data)

        # Otherwise, if the pair is not ordered according to the requested
        # sort direction, print a message and fail the test.
        elif (this_value > next_value and sort_direction[sort_level] == 1) or \
             (this_value < next_value and sort_direction[sort_level] == -1):
            print(
                "For halo index {0} we had a {1} value of {2}.  After sorting "
                "via lexsort using the fields {3} (inner-most sort first), "
                "the next halo (index {4}) has a {1} value of {5}.".format(
                    halo_idx, key, this_value, sort_fields, halo_idx + 1,
                    next_value))

            if gen_data:
                cleanup()
            pytest.fail()

        return None

    with h5py.File(fname_out, "r") as f_in:

        Snap_Keys, Snap_Nums = cmn.get_snapkeys_and_nums(f_in.keys())

        print("Looping over each snapshot.")
        for snap_key in tqdm(Snap_Keys):
            NHalos = len(f_in[snap_key][haloID_field])
            if NHalos < 2:  # Skip snapshots that wouldn't be sorted.
                continue

            # Since the user specifies the keys they wish to sort on (with
            # some of these potentially being None), we need to check that
            # every key has been sorted correctly.
            #
            # To do this we loop over the halos within a snapshot and first
            # check the outer-most key.  If halo[i] has the same outer-key as
            # halo[i + 1] we need to check an inner-key to ensure it's sorted.

            for idx in range(NHalos - 1):
                recursively_check_sort(f_in[snap_key], sort_fields,
                                       sort_direction, 0, idx, gen_data)
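
For reference, a self-contained sketch of how multi-field, mixed-direction sort indices can be built with `np.lexsort`. This mirrors what `get_sort_indices` presumably does; the implementation below is an illustrative assumption, not the module's actual code:

import numpy as np

def get_sort_indices_sketch(snapshot_data, sort_fields, sort_direction):
    # np.lexsort expects the inner-most key first and always sorts ascending;
    # multiplying a key by -1 flips it to descending.
    keys = [snapshot_data[field][:] * direction
            for field, direction in zip(reversed(sort_fields),
                                        reversed(sort_direction))]
    return np.lexsort(keys)

snap = {"ForestID": np.array([2, 1, 1, 2]),
        "Mass_200mean": np.array([5.0, 3.0, 9.0, 1.0])}
inds = get_sort_indices_sketch(snap, ["ForestID", "Mass_200mean"], [1, -1])
print(inds)  # [2 1 0 3]: ForestID ascending, mass descending within a forest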
Example #5
def my_test_check_haloIDs(fname_in=default_fname_in,
                          fname_out=default_fname_out,
                          haloID_field="ID",
                          index_mult_factor=1e12,
                          gen_data=0):
    """
    Checks the sorted haloIDs and snapshot numbers match the formula.

    This formula is the one that turns the snapshot-local halo index into a
    temporally unique ID.

    Parameters
    ----------

    fname_in, fname_out: String. Default: `<test_directory>/test_data.hdf5`,
    `<test_directory>/test_sorted.hdf5`.
        Path to the input HDF5 trees and path to where the sorted trees were
        saved.

    haloID_field: String. Default: 'ID'.
        Field name within the HDF5 file that corresponds to the unique halo ID.

    index_mult_factor: Integer. Default: 1e12.
        Multiplication factor to generate a temporally unique halo ID. See
        `common.index_to_temporalID`.

    gen_data: Integer. Default: 0.
        Flag whether this function was called using sorted trees generated for
        testing purposes.  If this flag is set to 1 then the sorted trees will
        be removed if the test fails.

    Returns
    ----------

    None.

    `~pytest.fail()` is invoked if the test fails.
    """

    files = [fname_in, fname_out]

    for file_to_test in files:
        print("Checking that the HaloIDs are correct for file "
              "{0}".format(file_to_test))
        with h5py.File(file_to_test, "r") as f_in:
            Snap_Keys, Snap_Nums = cmn.get_snapkeys_and_nums(f_in.keys())

            print("Looping over each Snapshot.")
            for snap_key in tqdm(Snap_Keys):
                if len(f_in[snap_key][haloID_field]) == 0:
                    continue

                file_haloIDs = f_in[snap_key][haloID_field][:]
                generated_haloIDs = cmn.index_to_temporalID(
                    np.arange(len(file_haloIDs)), Snap_Nums[snap_key],
                    index_mult_factor)

                # Check every snapshot's IDs against the formula.
                if not np.array_equal(generated_haloIDs, file_haloIDs):
                    print("The HaloIDs within file '{0}' were not "
                          "correct.".format(file_to_test))
                    print("HaloIDs were {0} and the expected IDs were "
                          "{1}.".format(file_haloIDs, generated_haloIDs))
                    print("If this is the test input data file, then your "
                          "input data may be wrong!  If this is the test "
                          "sorted output file, contact "
                          "[email protected]")

                    if gen_data:
                        cleanup(fname_in)
                    pytest.fail()
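
The formula being tested, sketched with a hypothetical stand-in for `cmn.index_to_temporalID` (the arithmetic is inferred from the docstrings, not from the module's source):

import numpy as np

def index_to_temporalID_sketch(index, snapnum, index_mult_factor):
    # Hypothetical: snapshot-local index -> temporally unique halo ID.
    return snapnum * index_mult_factor + index + 1

ids = index_to_temporalID_sketch(np.arange(3), 20, int(1e12))
print(ids)  # [20000000000001 20000000000002 20000000000003]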
Example #6
def treefrog_to_lhalo(fname_in,
                      fname_out,
                      haloID_field="ID",
                      forestID_field="ForestID",
                      Nforests=None,
                      write_binary_flag=1):
    """
    Takes the Treefrog trees that have had their IDs corrected to be in LHalo
    format and saves them in LHalo binary format.

    The data-structure of the Treefrog trees is assumed to be HDF5 File ->
    Snapshots -> Halo Properties at each snapshot.

    .. note::
        We require the input trees to be sorted via the forest ID
        (``forestID_field``) and suggest also sub-sorting on hostHaloID and mass.
        Sorting can be done using :py:mod:`astro3D.genesis.utils.forest_sorter`.

        We also require the input trees to have IDs that are LHalo compatible.
        See :py:mod:`astro3D.genesis.utils.convert_indices`.

    Parameters
    ----------

    fname_in, fname_out : String
        Path to the input HDF5 VELOCIraptor + treefrog trees and the path
        where the LHalo binary file will be saved.

    haloID_field : String, optional
        Field name within the HDF5 file that corresponds to the unique halo ID.

    forestID_field : String, optional
        Field name within the HDF5 file that corresponds to forest ID.

    Nforests : Integer, optional
        The number of forests to be processed. If ``None`` is passed then all
        forests are processed.

    write_binary_flag : Integer, optional
        Flag to decide whether to write to a binary or HDF5 file.
        0: HDF5 file only.
        1: Binary file only.
        2: Both binary and HDF5 file.

    Returns
    ----------

    None.

    Notes
    ----------

    The default parameters are chosen to match the ASTRO3D Genesis trees as
    produced by VELOCIraptor + Treefrog.
    """

    if rank == 0:
        print("")
        print("=================================")
        print("Going through the LHalo indices corrected Treefrog trees and "
              "saving in LHalo binary format.")
        print("Input Trees: {0}".format(fname_in))
        print("Output LHalo Trees: {0}".format(fname_out))
        print("ForestID Field Name: {0}".format(forestID_field))
        print("Number of Processors: {0}".format(size))
        print("Number of forests to process: {0}".format(Nforests))
        print("Write Binary Flag: {0}".format(write_binary_flag))
        print("=================================")
        print("")

    LHalo_Desc, multipledim_names = get_LHalo_datastruct()

    with h5py.File(fname_in, "r") as f_in:
        Snap_Keys, Snap_Nums = cmn.get_snapkeys_and_nums(f_in.keys())

        NHalos_forest, NHalos_forest_offset = cmn.get_halos_per_forest(
            f_in, Snap_Keys, haloID_field, forestID_field)

        # Find the snapshot key with the largest snapshot number: `max`
        # compares the integer values of the Snap_Nums dictionary and returns
        # the corresponding snapshot-group key.
        # Taken from
        # https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
        last_snap_key = max(Snap_Nums, key=Snap_Nums.get)

        total_forests_to_process = np.unique(
            f_in[last_snap_key][forestID_field][:])
        print("Total forests {0}".format(len(total_forests_to_process)))

        if Nforests:
            total_forests_to_process = total_forests_to_process[0:int(Nforests)]

        # If we're running in parallel, determine what forest IDs each
        # processor is handling.
        if size > 1:
            forests_to_process = determine_forests(NHalos_forest,
                                                   total_forests_to_process)

        else:
            forests_to_process = total_forests_to_process

        filenr = rank

        # We first want to determine the number of forests, total number of
        # halos in these forests, and number of halos per forest for each
        # forest we are processing.
        totNHalos = 0
        global_halos_per_forest = []

        for forestID in forests_to_process:
            # NHalos_forest is a nested dictionary accessed by each forestID.
            halos_per_forest = sum(NHalos_forest[forestID].values())
            global_halos_per_forest.append(halos_per_forest)
            totNHalos += halos_per_forest

        # Write out the header with all this info.
        print("Rank {0} writing {1} forests containing a total of {2} "
              "halos.".format(rank, len(forests_to_process), totNHalos))
        if write_binary_flag == 1 or write_binary_flag == 2:
            my_fname_out = "{0}.{1}".format(fname_out, rank)
        else:
            my_fname_out = "{0}.{1}.hdf5".format(fname_out, rank)

        write_header(my_fname_out, len(forests_to_process), totNHalos,
                     global_halos_per_forest, write_binary_flag)

        start_time = time.time()
        hubble_h = get_hubble_h(f_in)
        # Now for each forest we want to populate the LHalos forest struct, fix
        # any IDs (e.g., flybys) and then write them out.
        if write_binary_flag == 1 or write_binary_flag == 2:
            f_out = open(my_fname_out, "ab")
        else:
            f_out = h5py.File(my_fname_out, "a")

        for count, forestID in enumerate(forests_to_process):
            if count % 500 == 0:
                print("Rank {0} processed {1} Forests ({2:.2f} seconds "
                      "elapsed).".format(rank, count,
                                         time.time() - start_time))
            NHalos = sum(NHalos_forest[forestID].values())

            forest_halos = np.zeros(NHalos, dtype=LHalo_Desc)
            forest_halos = populate_forest(f_in, forest_halos, Snap_Keys,
                                           Snap_Nums, forestID, NHalos_forest,
                                           NHalos_forest_offset, filenr,
                                           hubble_h)

            NHalos_root = NHalos_forest[forestID][last_snap_key]
            forest_halos, true_fof_idx, flyby_inds = fix_flybys(
                forest_halos, NHalos_root)

            # Now if there were any flybys, we need to update the
            # `NextHaloInFOFgroup` chain to account for them.
            if true_fof_idx:
                # We do this by starting at the main FoF group and moving until we reach
                # the end (`NextHaloInFOFgroup = -1`).  Then we attach the first flyby halo
                # onto the end.  We then move down THAT flyby's chain and repeat the
                # process.
                next_in_chain = forest_halos["NextHaloInFOFgroup"][
                    true_fof_idx]
                curr_halo = true_fof_idx

                for flyby_ind in flyby_inds:
                    while next_in_chain != -1:
                        curr_halo = next_in_chain
                        next_in_chain = forest_halos["NextHaloInFOFgroup"][
                            next_in_chain]
                    forest_halos["NextHaloInFOFgroup"][curr_halo] = flyby_ind
                    next_in_chain = flyby_ind

                # After this there should only be one halo at the root
                # snapshot with `NextHaloInFOFgroup == -1`.
                assert(len(np.where(forest_halos["NextHaloInFOFgroup"][0:NHalos_root] \
                                    == -1)[0]) == 1)
            # Flybys and `NextHaloInFOFgroup` now fixed.

            forest_halos = fix_nextprog(forest_halos)

            # The VELOCIraptor + Treefrog trees point to themselves when
            # they terminate.  However LHalo Trees requires these to be -1,
            # so find the instances where `NextProgenitor` and
            # `FirstProgenitor` point to themselves and adjust them to -1.
            w = np.arange(NHalos)
            NextProg_tofix = [
                x for x in w if x == forest_halos["NextProgenitor"][x]
            ]
            FirstProg_tofix = [
                x for x in w if x == forest_halos["FirstProgenitor"][x]
            ]

            forest_halos["NextProgenitor"][NextProg_tofix] = -1
            forest_halos["FirstProgenitor"][FirstProg_tofix] = -1

            # All done! Append to the file.

            if write_binary_flag == 1 or write_binary_flag == 2:
                forest_halos.tofile(f_out)
            else:
                group_name = "tree_{0:03d}".format(forestID)
                f_out.create_group(group_name)

                # Need to be careful here.  Some properties (e.g., 'Position') we
                # want to save as Nx3 arrays (instead of 3 Nx1 arrays).  So
                # first save only those arrays that aren't multi-dimensional.
                for subgroup_name in LHalo_Desc.names:
                    if not cmn.search_dict_of_lists(subgroup_name,
                                                    multipledim_names):
                        f_out[group_name][subgroup_name] = forest_halos[
                            subgroup_name]

                # Then go through all the multi-dimensional arrays and save
                # an Nx3 array that contains all the data.
                for name in multipledim_names:

                    # Initialize an Nx3 array.
                    Ndim = len(multipledim_names[name])
                    array = np.zeros((len(forest_halos), Ndim))

                    # Then populate that array.
                    for dim, dim_name in enumerate(multipledim_names[name]):
                        array[:, dim] = forest_halos[dim_name]

                    # Finally save it.
                    f_out[group_name][name] = array

        # End of Forests Loop.

        f_out.close()

    # Input HDF5 file closed.

    print("Rank {0} has finished writing out {1} Forests to "
          "{2}".format(rank, len(forests_to_process), my_fname_out))
    print("Total time elapsed: {0:.2f} Seconds.".format(time.time() -
                                                        start_time))

    # If the user set `write_binary_flag == 2`, then convert the binary file to
    # a HDF5 one.

    if write_binary_flag == 2:
        hdf5_fname_out = "{0}.{1}.hdf5".format(fname_out, rank)
        convert_binary_to_hdf5(my_fname_out, hdf5_fname_out)
        print("Binary file also converted to HDF5.")
Example #7
def convert_indices(
    fname_in,
    fname_out,
    haloID_field="ID",
    forestID_field="ForestID",
    ID_fields=["Head", "Tail", "RootHead", "RootTail", "ID", "hostHaloID"],
    index_mult_factor=int(1e12)):
    """
    Converts temporally unique tree IDs to ones that are forest-local as
    required by the LHalo Trees format.

    The data-structure of the Treefrog trees is assumed to be HDF5 File ->
    Snapshots -> Halo Properties at each snapshot.

    A new HDF5 file is saved out with the updated IDs.

    .. note::
        We require the input trees to be sorted via the forest ID
        (``forestID_field``) and suggest also sub-sorting on ``hostHaloID`` and
        mass. Sorting can be done using :py:mod:`astro3D.genesis.utils.forest_sorter`.

    Parameters
    ----------

    fname_in, fname_out : String
        Path to the input HDF5 VELOCIraptor + treefrog trees and the path
        where the LHalo correct trees will be saved.

    haloID_field : String, optional
        Field name within the HDF5 file that corresponds to the unique halo ID.

    forestID_field : String, optional
        Field name within the HDF5 file that corresponds to forest ID.

    ID_fields : List of strings, optional
        The HDF5 field names that correspond to properties that use halo IDs.
        As the halo IDs are updated to match the required LHalo Tree format,
        these must also be updated.

    index_mult_factor : Integer, optional
        Multiplication factor to generate a temporally unique halo ID.

    Returns
    ----------

    None.

    Notes
    ----------

    The default parameters are chosen to match the ASTRO3D Genesis trees as
    produced by VELOCIraptor + Treefrog.
    """

    print("")
    print("=================================")
    print("Converting Treefrog IDs to LHalo Tree indices.")
    print("Input Trees: {0}".format(fname_in))
    print("Output LHalo ID Trees: {0}".format(fname_out))
    print("Foest ID Field Names: {0}".format(forestID_field))
    print("ID Field Names: {0}".format(ID_fields))
    print("ForestID Field Name: {0}".format(forestID_field))
    print("Index Mult Factor: {0}".format(index_mult_factor))
    print("=================================")
    print("")

    with h5py.File(fname_in, "r") as f_in, \
         h5py.File(fname_out, "w") as f_out:

        Snap_Keys, Snap_Nums = cmn.get_snapkeys_and_nums(f_in.keys())

        NHalos_forest, NHalos_forest_offset = cmn.get_halos_per_forest(
            f_in, Snap_Keys, haloID_field, forestID_field)

        print("Copying the old tree file to a new one.")
        for key in tqdm(f_in.keys()):
            cmn.copy_group(f_in, f_out, key)

        print("Now creating a dictionary that maps the old, global indices to "
              "ones that are forest-local.")

        start_time = time.time()

        NHalos_processed = np.zeros(len(NHalos_forest), dtype=np.int64)
        Forests_InSnap = {}
        ID_maps = {}
        for snap_key in tqdm(Snap_Keys[::-1]):
            try:
                NHalos = len(f_in[snap_key][haloID_field])
                if (NHalos == 0):
                    continue
            except KeyError:
                continue

            oldIDs_global = []
            newIDs_global = []

            forests_thissnap = np.unique(f_in[snap_key][forestID_field][:])

            Forests_InSnap[snap_key] = forests_thissnap

            oldIDs = f_in[snap_key][haloID_field][:]

            for forest in forests_thissnap:

                NHalos_snapshot = NHalos_forest[forest][snap_key]
                offset = NHalos_forest_offset[forest][snap_key]

                idx_lower = offset
                idx_upper = NHalos_snapshot + offset

                oldIDs_thisforest = oldIDs[idx_lower:idx_upper]
                newIDs_thisforest = np.arange(
                    NHalos_processed[forest - 1],
                    NHalos_processed[forest - 1] + NHalos_snapshot)

                for val1, val2 in zip(oldIDs_thisforest, newIDs_thisforest):
                    oldIDs_global.append(int(val1))
                    newIDs_global.append(int(val2))

                NHalos_processed[forest - 1] += NHalos_snapshot

            oldIDs_to_newIDs = dict(
                zip(list(oldIDs_global), list(newIDs_global)))
            ID_maps[Snap_Nums[snap_key]] = oldIDs_to_newIDs

        # For some ID fields (e.g., NextProgenitor), the value is -1.
        # When we convert from the temporalID to a snapshot number, we
        # subtract 1 and divide by the multiplication factor (default 1e12)
        # then cast to an integer.  Hence -2 divided by a huge number will
        # be less than 1 and when it's cast to an integer will result in 0.
        # So the 'Snapshot Number' for values of -1 will be 0.  We want to
        # preserve these -1 flags so we map -1 to -1.
        ID_maps[0] = {-1: -1}

        end_time = time.time()
        print("Creation of dictionary took {0:3f} "
              "seconds.".format(end_time - start_time))

        print("Now going through all the snapshots and updating the IDs.")
        start_time = time.time()

        for snap_key in tqdm(Snap_Keys):
            try:
                NHalos = len(f_in[snap_key][haloID_field])
                if (NHalos == 0):
                    continue
            except KeyError:
                continue

            forests_thissnap = np.unique(f_in[snap_key][forestID_field][:])

            for field in ID_fields:  # Only the ID fields need updating.

                oldID = f_in[snap_key][field][:]
                snapnum = cmn.temporalID_to_snapnum(oldID, index_mult_factor)

                # We now want to map the oldIDs to the new, forest-local IDs.
                # However, a dictionary can't be indexed with a NumPy array
                # (arrays aren't hashable), so the lookup is done element by
                # element.

                newID = [ID_maps[snap][ID] for snap, ID in zip(snapnum, oldID)]
                f_out[snap_key][field][:] = newID

            # Field Loop.
        # Snapshot Loop.

        end_time = time.time()
        print("Updating took {0:.3f} seconds.".format(end_time - start_time))
Example #8
def adjust_spec(fname_in,
                fname_out,
                haloID_field="ID",
                FirstHaloInFOFgroup_field="hostHaloID",
                index_mult_factor=int(1e12)):
    """
    Adjusts some fields of the VELOCIraptor trees to match the LHaloTree Specs.

    Currently calls the following functions:
        1. :py:mod:`astro3D.genesis.utils.adjust_hostHaloID`

    Parameters
    ----------

    fname_in, fname_out : String
        Path to the input HDF5 trees and path to where the updated trees will be
        saved.

    haloID_field : String, optional
        Field name within the HDF5 file that corresponds to the unique halo ID.

    FirstHaloInFOFgroup_field : String, optional
        Field name within the HDF5 file that corresponds to
        `FirstHaloInFOFgroup` in the LHaloTree structure.

    index_mult_factor: Integer, optional
        Multiplication factor to generate a temporally unique halo ID.

    Returns
    ----------

    None.

    Notes
    ----------

    The default parameters are chosen to match the ASTRO3D Genesis trees as
    produced by VELOCIraptor + Treefrog.
    """

    print("")
    print("=================================")
    print("Running adjust_spec")
    print("Input Trees: {0}".format(fname_in))
    print("Output Trees: {0}".format(fname_out))
    print("HaloID Field: {0}".format(haloID_field))
    print("Index Mult Factor: {0}".format(index_mult_factor))
    print("=================================")
    print("")

    start_time = time.time()

    with h5py.File(fname_in, "r") as f_in, \
         h5py.File(fname_out, "w") as f_out:

        Snap_Keys, Snap_Nums = cmn.get_snapkeys_and_nums(f_in.keys())

        print("Copying the file over")
        for key in tqdm(f_in.keys()):
            # First copy all the groups to the output file.
            cmn.copy_group(f_in, f_out, key)
        print("Done!")

        # Then adjust all the things we need to adjust.
        print(
            "Adjusting the hostHaloIDs to point to themselves rather than -1.")
        adjust_hostHaloID(f_out, haloID_field, FirstHaloInFOFgroup_field,
                          Snap_Keys, Snap_Nums, index_mult_factor)
        print("Done!")

    end_time = time.time()
    print("Took {0:3f} seconds".format(end_time - start_time))