Code example #1
0
def IDL_read_file(idl_file):
    """
    Reads an IDL file and converts it to a Python dictionary

    Parameters
    ----------
    idl_file : string
        Path to the filename being used

    Returns
    ----------
    idl_dict : python dictionary
        Dictionary with the data from `idl_file`

    Raises
    ----------
    LSSUtils_Error : Exception
        If `idl_file` cannot be parsed as an IDL save file.
    """
    # Checking that file exists
    fd.File_Exists(idl_file)
    # Converting to dictionary
    try:
        idl_dict = readsav(idl_file, python_dict=True)
    except Exception:
        # Fix: the second placeholder must be `{1}` (the file path);
        # the original repeated `{0}` and never showed the path.
        msg = '{0} `idl_file` {1} is not an IDL file'.format(
            fd.Program_Msg(__file__), idl_file)
        raise LSSUtils_Error(msg)

    return idl_dict
Code example #2
0
def pandas_file_to_hdf5_file(df_file, hdf5_file, key=None, mode='w'):
    """
    Converts a HDF5 file with pandas format and converts it to a
    normal HDF5 file

    Parameters
    ----------
    df_file : str
        Path to the `df_file` containing the pandas DataFrame to be converted

    hdf5_file : str
        Path to the output HDF5 file containing arrays as keys

    key : str or NoneType, optional
        Key or path in HDF5 file for the `df_file` and `hdf5_file`

    mode : str, optional
        Mode used when opening `hdf5_file` with `h5py.File`.
        This variable is set to 'w' by default.
    """
    file_msg = fd.Program_Msg(__file__)
    # Fix: validate the actual input argument; the original referenced an
    # undefined name `filename`.
    fd.File_Exists(df_file)
    # Reading in DataFrame (and recovering its key when none is given)
    if not key:
        data, key = read_pandas_hdf5(df_file, key=None, ret=True)
    else:
        data = read_pandas_hdf5(df_file, key=key)
    # Rearranging the DataFrame into a structured record array
    arr_names = data.dtypes.index.values
    dtype_arr = data.dtypes.values
    # Fix: iterate over `dtype_arr`; the original iterated the
    # not-yet-defined `dtypes_arr`, raising NameError.
    dtypes_arr = np.array([x.str for x in dtype_arr])
    # Fix: `np.dtype` requires a list of tuples; a lazy `zip` object
    # raises a TypeError on Python 3.
    data_dtypes = np.dtype(list(zip(arr_names, dtypes_arr)))
    dataset = np.recarray((len(data), ), dtype=data_dtypes)
    for name in dataset.dtype.names:
        dataset[name] = data[name]
    # Saving file to HDF5 format; `with` guarantees the file handle is
    # closed even if `create_dataset` raises.
    with h5py.File(hdf5_file, mode=mode) as hdf5_obj:
        hdf5_obj.create_dataset(key, data=dataset)
    # Fix: the message was built but never emitted in the original.
    msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
    print(msg)
Code example #3
0
def read_hdf5_file_to_pandas_DF(hdf5_file, key=None):
    """
    Reads content of HDF5 file and converts it to a Pandas DataFrame

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file. This is the file that will be converted
        to a pandas DataFrame.

    key : str or NoneType, optional
        Key or path in `hdf5_file` for the pandas DataFrame and the normal
        HDF5 file.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from `hdf5_file` under the `key` directory.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `hdf5_file` cannot be read as a pandas-format HDF5 file.
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(hdf5_file)
    # Reading in Pandas DataFrame
    try:
        df = pd.read_hdf(hdf5_file, key=key)
    except Exception:
        msg = '{0} Could not read `hdf5_file` ({1})! Please check if it exists'
        msg = msg.format(file_msg, hdf5_file)
        # Fix: raise the formatted message; the original raised only the
        # program prefix and discarded the diagnostic text.
        raise LSSUtils_Error(msg)

    return df
Code example #4
0
def read_pandas_hdf5(hdf5_file, key=None, ret=False):
    """
    Reads a HDF5 file that contains one or many datasets.
    It converts it into a pandas DataFrame.

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file containing one or more pandas DataFrame(s).

    key : str or NoneType
        If provided, it will extract the `key` value as a pandas DataFrame.
        This value is set to `None` by default.

    ret : boolean, optional
        If True, it returns the value of the `key`.
        By default, it is set to False.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from the `hdf5_file` with the data from the `key` directory.
        Returns `None` (after printing the available keys) when `key` is
        missing or ambiguous.
    """
    # Fix: use the `fd.` prefix as everywhere else in this module; the bare
    # `Program_Msg` name raised NameError at runtime.
    file_msg = fd.Program_Msg(__file__)
    # Checking that file exists
    fd.File_Exists(hdf5_file)
    # Collecting the available keys, then closing the store
    hdf5_obj = pd.HDFStore(hdf5_file)
    hdf5_keys = list(hdf5_obj.keys())
    hdf5_obj.close()
    # Reading in HDF5 file
    if key is None:
        try:
            df = pd.read_hdf(hdf5_file)
            if ret:
                return df, hdf5_keys[0]
            else:
                return df
        except Exception:
            # More than one dataset in the file: the user must pick a key.
            msg = '{0} Must specify which key to use:\n\t'.format(file_msg)
            msg += 'Possible keys: \n'
            print(msg)
            for key_i, name in enumerate(hdf5_keys):
                print('\t Key {0}:  {1}'.format(key_i, name))
    else:
        if key not in hdf5_keys:
            print('{0} Key not in the file: '.format(file_msg))
            print('Possible Keys:\n')
            for key_i, name in enumerate(hdf5_keys):
                print('\t Key {0}:  {1}'.format(key_i, name))
        else:
            df = pd.read_hdf(hdf5_file, key=key)
            if ret:
                return df, key
            else:
                return df
Code example #5
0
def concatenate_pd_df(directory, filetype='hdf5', foutput=None, outonly=True):
    """
    Concatenates pandas DataFrames into a single DataFrame

    Parameters
    ----------
    directory : str
        Path to the folder containing multiple pandas-HDF5 files

    filetype : str, optional
        File format of the file in `directory` to be read
        This is set to `hdf5` by default.

    foutput : str or NoneType
        If not `None`, it is the basename of the output file in HDF5 format

    outonly : boolean, optional
        If True, it returns the pandas DataFrame.
        If False, it only saved the concatenated `pandas.DataFrame`.

    Returns
    ----------
    df_conc : `pandas.DataFrame`
        DataFrame containing the combined datasets from the files in
        `directory`.

    Raises
    ----------
    LSSUtils_Error : Exception
        If no files are found in `directory`, it raises an error
        warning about this.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that `directory` exists
    if not os.path.exists(directory):
        msg = '{0} `directory` {1} is not a valid path! Exiting!'.format(
            file_msg, directory)
        raise LSSUtils_Error(msg)
    # Listing files to concatenate.
    # Fix: the original called `df.index(...)` on an undefined name `df`;
    # the file-listing helper lives in the `fd` module like the other
    # utilities used here (NOTE(review): confirm `fd.Index` is the intended
    # helper in the project's file-utilities module).
    files_arr = fd.Index(directory, '.' + filetype, sort=True)
    print('{0} Found `{1}` files'.format(file_msg, files_arr.size))
    if len(files_arr) > 0:
        # Reading every pandas-HDF5 file into its own DataFrame
        df_arr = [read_pandas_hdf5(file_ii) for file_ii in files_arr]
        # Concatenating arrays
        df_conc = pd.concat(df_arr, ignore_index=True)
        # Deciding name of resulting output file
        if (foutput is not None) and isinstance(foutput, str):
            foutput_file = os.path.join(directory,
                                        '{0}.{1}'.format(foutput, filetype))
            # Saving resulting DataFrame
            pandas_df_to_hdf5_file(df_conc, foutput_file, key='/Main')
            # Checking file exists
            fd.File_Exists(foutput_file)
            # Fix: placeholder `{2}` was out of range for two format
            # arguments and raised IndexError; must be `{1}`.
            print('{0} Output file saved in: {1}'.format(
                file_msg, foutput_file))
        # If only outputting concatenated DataFrame
        if outonly:
            return df_conc
    else:
        msg = '{0} No files in `{1}` with extension `{2}`'.format(
            file_msg, directory, filetype)
        raise LSSUtils_Error(msg)
Code example #6
0
def catl_sdss_merge(catl_pd_ii,
                    catl_kind='data',
                    catl_type='mr',
                    sample_s='19',
                    halotype='fof',
                    clf_method=3,
                    hod_n=0,
                    clf_seed=1235,
                    perf_opt=False,
                    return_memb_group=False,
                    print_filedir=False):
    """
    Merges the member and group catalogues for a given set of input parameters,
    and returns a modified version of the galaxy group catalogues with added
    info about the galaxy groups.

    Parameters
    ------------
    catl_pd_ii : int
        Index of the catalogue to match, 
        from :func:`~cosmoutils.mock_catalogues.catls_utils.extract_catls`
        function.

    catl_kind : {'data', 'mocks'} str, optional
        Type of catalogue to use. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} str, optional
        Type of catalogue to use. It shows which abundance matching method
        was used for the CLF when assigning halo masses. This variable is 
        set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample to use. This variable is set to '19' by default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    halotype : {'fof', 'so'} str, optional
        Type of the dark matter halo of the simulation used to create the 
        synthetic catalogues. This variable is set to `fof` by default.

        Options:
            - 'fof': Friends-of-Friends halos.
            - 'so' : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies.
        This variable is set to `3` by default.

        Options:
            - `1` : Independent assigment of (g-r) color, sersic, and log(ssfr)
            - `2` : (g-r) decides active/passive designation and draw values 
                    independently.
            - `3` : (g-r) decides active/passive designations, and 
                    assigns other galaxy properties for that given galaxy.

    hod_n : {0, 1} int, optional
        HOD model to use. Only relevant when `catl_kind == mocks`.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. This variable is set to `1235` 
        by default.

    perf_opt : boolean, optional
        If True, it chooses to analyze the `perfect` set of synthetic
        catalogues. This variable is set to `False` by default.

    return_memb_group :  `bool`, optional
        If True, the function returns the member and group catalogues,
        along with the merged catalogue.
        It returns ``<memb_group_pd, memb_pd, group_pd>``

    print_filedir : boolean, optional
        If True, the output directory is printed onto the screen.

    Return
    ------------
    memb_group_pd : `pandas.DataFrame`
        Combined version of the i-th member and group catalogues.
        It contains both galaxy and group information.

    memb_pd : `pandas.DataFrame`
        Catalogue of the member galaxies of the i-th catalogue.
        This catalogue contains information of the `member galaxies`.

    group_pd : `pandas.DataFrame`
        Catalogue of the groups of the i-th catalogue.
        This catalogue contains information of the `galaxy groups`.
    
    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    catl_pd_ii_valid = (float, int)
    catl_kind_valid = ['data', 'mocks']
    catl_type_valid = ['mr', 'mstar']
    sample_s_valid = ['19', '20', '21']
    halotype_valid = ['fof', 'so']
    clf_method_valid = [1, 2, 3]
    hod_n_valid = [0, 1]
    # `catl_pd_ii`
    if (isinstance(catl_pd_ii, catl_pd_ii_valid)):
        catl_pd_ii = int(catl_pd_ii)
    else:
        # Fix: the message previously reported `catl_kind` although it is
        # `catl_pd_ii` that failed validation.
        msg = '{0} `catl_pd_ii` ({1}) is not a valid input!'.format(
            file_msg, type(catl_pd_ii))
        raise LSSUtils_Error(msg)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_type`
    if not (catl_type in catl_type_valid):
        msg = '{0} `catl_type` ({1}) is not a valid input!'.format(
            file_msg, catl_type)
        raise LSSUtils_Error(msg)
    # `sample_s`
    if not (sample_s in sample_s_valid):
        msg = '{0} `sample_s` ({1}) is not a valid input!'.format(
            file_msg, sample_s)
        raise LSSUtils_Error(msg)
    # `halotype`
    if not (halotype in halotype_valid):
        msg = '{0} `halotype` ({1}) is not a valid input!'.format(
            file_msg, halotype)
        raise LSSUtils_Error(msg)
    # `clf_method`
    if not (clf_method in clf_method_valid):
        msg = '{0} `clf_method` ({1}) is not a valid input!'.format(
            file_msg, clf_method)
        raise LSSUtils_Error(msg)
    # `hod_n`
    if not (hod_n in hod_n_valid):
        msg = '{0} `hod_n` ({1}) is not a valid input!'.format(file_msg, hod_n)
        raise LSSUtils_Error(msg)
    # `perf_opt`
    if not (isinstance(perf_opt, bool)):
        msg = '{0} `perf_opt` ({1}) is not a valid type!'.format(
            file_msg, type(perf_opt))
        raise LSSUtils_Error(msg)
    # `return_memb_group`
    if not (isinstance(return_memb_group, bool)):
        msg = '{0} `return_memb_group` ({1}) is not a valid type!'.format(
            file_msg, type(return_memb_group))
        raise LSSUtils_Error(msg)
    # `print_filedir`
    if not (isinstance(print_filedir, bool)):
        msg = '{0} `print_filedir` ({1}) is not a valid type!'.format(
            file_msg, type(print_filedir))
        raise LSSUtils_Error(msg)
    ##
    ## Extracting catalogues given input parameters
    (memb_arr, memb_len) = extract_catls(catl_kind=catl_kind,
                                         catl_type=catl_type,
                                         sample_s=sample_s,
                                         halotype=halotype,
                                         clf_method=clf_method,
                                         hod_n=hod_n,
                                         clf_seed=clf_seed,
                                         perf_opt=perf_opt,
                                         catl_info='members',
                                         return_len=True,
                                         print_filedir=print_filedir)
    # Checking number of catalogues
    if catl_pd_ii > (memb_len - 1):
        msg = '{0} `catl_pd_ii` ({1}) is OUT of range ({2})!'.format(
            file_msg, catl_pd_ii, memb_len)
        raise LSSUtils_Error(msg)
    ##
    ## Extracting group catalogue
    # i-th Galaxy catalogue
    memb_path = memb_arr[catl_pd_ii]
    # i-th Galaxy Group catalogue
    group_path = catl_sdss_dir(catl_kind=catl_kind,
                               catl_type=catl_type,
                               sample_s=sample_s,
                               halotype=halotype,
                               clf_method=clf_method,
                               hod_n=hod_n,
                               clf_seed=clf_seed,
                               perf_opt=perf_opt,
                               catl_info='groups',
                               print_filedir=print_filedir)
    ##
    ## Paths to catalogue
    # Mocks
    if catl_kind == 'mocks':
        group_path += os.path.basename(memb_path).replace('memb', 'group')
    # Data
    if catl_kind == 'data':
        group_path += os.path.basename(memb_path).replace('Gals', 'Group')
    # Checking that file exists
    fd.File_Exists(group_path)
    ##
    ## Reading in Catalogues
    memb_pd = fr.read_hdf5_file_to_pandas_DF(memb_path)
    group_pd = fr.read_hdf5_file_to_pandas_DF(group_path)
    ## Keys for the catalogues
    (gm_key, id_key, galtype_key) = catl_keys(catl_kind,
                                              perf_opt=perf_opt,
                                              return_type='list')
    ## Matching keys from Group catalogue
    if len(np.unique(memb_pd[id_key])) == len(np.unique(group_pd[id_key])):
        # Group column names
        group_colnames = np.sort(group_pd.columns.values)
        group_groupid = np.sort(np.unique(group_pd[id_key]))
        n_groups = len(group_groupid)
        n_memb = len(memb_pd)
        ## Sorting `memb_pd` by `id_key`
        # Member catalogue
        memb_pd.sort_values(by=id_key, inplace=True)
        memb_pd.reset_index(inplace=True, drop=True)
        # Group catalogue
        group_pd.sort_values(by=id_key, inplace=True)
        group_pd.reset_index(inplace=True, drop=True)
        ## Renaming columns: prefix group columns with 'GG', but keep the
        ## shared `id_key` unprefixed so the merge below can match on it.
        g_colnames_dict = {ii: 'GG' + ii for ii in group_colnames}
        group_pd.rename(columns=g_colnames_dict, inplace=True)
        group_pd.rename(columns={'GG' + id_key: id_key}, inplace=True)
        ##
        ## Merging the 2 DataFrames
        memb_group_pd = pd.merge(left=memb_pd,
                                 right=group_pd,
                                 how='left',
                                 left_on=id_key,
                                 right_on=id_key)
    else:
        msg = '{0} Lengths of the 2 DataFrames (`memb_pd`, `group_pd`) '
        msg += 'do not match!'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    ##
    ## Returning DataFrames
    if return_memb_group:
        return_obj = (memb_group_pd, memb_pd, group_pd)
    else:
        return_obj = memb_group_pd

    return return_obj