def IDL_read_file(idl_file):
    """
    Reads an IDL `.sav` file and converts it to a Python dictionary.

    Parameters
    ----------
    idl_file : str
        Path to the IDL file being read.

    Returns
    ----------
    idl_dict : dict
        Dictionary with the data from `idl_file`.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `idl_file` cannot be parsed as an IDL file.
    """
    # Checking that file exists
    fd.File_Exists(idl_file)
    # Converting to dictionary
    try:
        idl_dict = readsav(idl_file, python_dict=True)
    except Exception:
        # Bug fix: the second placeholder was `{0}`, so the offending
        # `idl_file` path never appeared in the error message.
        msg = '{0} `idl_file` {1} is not an IDL file'.format(
            fd.Program_Msg(__file__), idl_file)
        raise LSSUtils_Error(msg)

    return idl_dict
def pandas_file_to_hdf5_file(df_file, hdf5_file, key=None, mode='w'):
    """
    Converts an HDF5 file in pandas format into a normal HDF5 file.

    Parameters
    ----------
    df_file : str
        Path to the file containing the pandas DataFrame to be converted.

    hdf5_file : str
        Path to the output HDF5 file containing arrays as keys.

    key : str or NoneType, optional
        Key or path in the HDF5 file for the `df_file` and `hdf5_file`.
        If None, the first key found in `df_file` is used.

    mode : str, optional
        Mode with which `hdf5_file` is opened. Set to 'w' by default.
    """
    file_msg = fd.Program_Msg(__file__)
    # Bug fix: previously checked the undefined name `filename`.
    fd.File_Exists(df_file)
    # Reading in DataFrame
    if not key:
        data, key = read_pandas_hdf5(df_file, key=None, ret=True)
    else:
        data = read_pandas_hdf5(df_file, key=key)
    # Rearranging data into a structured record array
    arr_names = data.dtypes.index.values
    # Bug fix: `dtypes_arr` was referenced before assignment; the
    # comprehension must read from `data.dtypes.values`.
    dtypes_arr = np.array([x.str for x in data.dtypes.values])
    # `zip` must be materialized for `np.dtype` under Python 3.
    data_dtypes = np.dtype(list(zip(arr_names, dtypes_arr)))
    dataset = np.recarray((len(data), ), dtype=data_dtypes)
    for name in dataset.dtype.names:
        dataset[name] = data[name]
    # Saving file to HDF5 format
    hdf5_obj = h5py.File(hdf5_file, mode=mode)
    hdf5_obj.create_dataset(key, data=dataset)
    hdf5_obj.close()
    # Bug fix: the confirmation message was built but never emitted.
    msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
    print(msg)
def read_hdf5_file_to_pandas_DF(hdf5_file, key=None):
    """
    Reads the content of an HDF5 file and converts it to a pandas DataFrame.

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file. This is the file that will be converted
        to a pandas DataFrame.

    key : str or NoneType, optional
        Key or path in `hdf5_file` for the pandas DataFrame and the
        normal HDF5 file.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from `hdf5_file` under the `key` directory.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `hdf5_file` cannot be read as a pandas-formatted HDF5 file.
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(hdf5_file)
    # Reading in Pandas DataFrame
    try:
        df = pd.read_hdf(hdf5_file, key=key)
    except Exception:
        msg = '{0} Could not read `hdf5_file` ({1})! Please check if it exists'
        msg = msg.format(file_msg, hdf5_file)
        # Bug fix: previously raised with `file_msg` instead of the
        # composed `msg`, losing the filename from the exception.
        raise LSSUtils_Error(msg)

    return df
def read_pandas_hdf5(hdf5_file, key=None, ret=False):
    """
    Reads an HDF5 file that contains one or many datasets and converts
    it into a pandas DataFrame.

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file containing one or more pandas DataFrame(s).

    key : str or NoneType, optional
        If provided, it will extract the `key` value as a pandas
        DataFrame. This value is set to `None` by default.

    ret : boolean, optional
        If True, it also returns the value of the `key` used.
        By default, it is set to False.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from the `hdf5_file` with the data from the `key`
        directory. When `ret` is True, a ``(df, key)`` tuple is
        returned instead. If the key cannot be resolved, the available
        keys are printed and nothing is returned.
    """
    # Bug fix: `Program_Msg` was called without the `fd.` prefix used
    # everywhere else in this module (NameError at runtime).
    file_msg = fd.Program_Msg(__file__)
    # Checking that file exists
    fd.File_Exists(hdf5_file)
    # Checking number of keys
    hdf5_obj = pd.HDFStore(hdf5_file)
    hdf5_keys = [ii for ii in hdf5_obj.keys()]
    hdf5_obj.close()
    # Reading in HDF5 file
    if key is None:
        try:
            df = pd.read_hdf(hdf5_file)
            if ret:
                return df, hdf5_keys[0]
            else:
                return df
        except Exception:
            # Ambiguous file: report the available keys to the user.
            msg = '{0} Must specify which key to use:\n\t'.format(file_msg)
            msg += 'Possible keys: \n'
            print(msg)
            for key_i, name in enumerate(hdf5_keys):
                print('\t Key {0}: {1}'.format(key_i, name))
    else:
        if key not in hdf5_keys:
            print('{0} Key not in the file: '.format(file_msg))
            print('Possible Keys:\n')
            for key_i, name in enumerate(hdf5_keys):
                print('\t Key {0}: {1}'.format(key_i, name))
        else:
            df = pd.read_hdf(hdf5_file, key=key)
            if ret:
                return df, key
            else:
                return df
def concatenate_pd_df(directory, filetype='hdf5', foutput=None, outonly=True):
    """
    Concatenates pandas DataFrames into a single DataFrame.

    Parameters
    ----------
    directory : str
        Path to the folder containing multiple pandas-HDF5 files.

    filetype : str, optional
        File format of the files in `directory` to be read.
        This is set to `hdf5` by default.

    foutput : str or NoneType, optional
        If not `None`, it is the basename of the output file, saved in
        HDF5 format.

    outonly : boolean, optional
        If True, it returns the pandas DataFrame. If False, it only
        saves the concatenated `pandas.DataFrame`.

    Returns
    ----------
    df_conc : `pandas.DataFrame`
        DataFrame containing the combined datasets from the files in
        `directory`.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `directory` does not exist or no files with extension
        `filetype` are found in it.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that `directory` exists
    if not os.path.exists(directory):
        msg = '{0} `directory` {1} is not a valid path! Exiting!'.format(
            file_msg, directory)
        raise LSSUtils_Error(msg)
    # Concatenating files
    # Bug fix: `df.index(...)` referenced an undefined name; the file
    # lister in the `fd` utilities module is intended here.
    files_arr = fd.Index(directory, '.' + filetype, sort=True)
    print('{0} Found `{1}` files'.format(file_msg, files_arr.size))
    if len(files_arr) > 0:
        # Initializing array that contains info
        df_arr = [[] for x in range(len(files_arr))]
        # Looping over HDF5 (pandas) files
        for ii, file_ii in enumerate(files_arr):
            df_arr[ii] = read_pandas_hdf5(file_ii)
        # Concatenating arrays
        df_conc = pd.concat(df_arr, ignore_index=True)
        # Deciding name of resulting output file
        if (foutput is not None) and (type(foutput) == str):
            foutput_file = os.path.join(directory,
                                        '{0}.{1}'.format(foutput, filetype))
            # Saving resulting DataFrame
            # Bug fix: `pandas_df_to_hdf5_file` does not exist; the
            # converter defined in this module is `pandas_file_to_hdf5_file`.
            pandas_file_to_hdf5_file(df_conc, foutput_file, key='/Main')
            # Checking file exists
            fd.File_Exists(foutput_file)
            # Bug fix: format index `{2}` was out of range.
            print('{0} Output file saved in: {1}'.format(
                file_msg, foutput_file))
        # If only outputting concatenated DataFrame
        if outonly:
            return df_conc
    else:
        msg = '{0} No files in `{1}` with extension `{2}`'.format(
            file_msg, directory, filetype)
        raise LSSUtils_Error(msg)
def catl_sdss_merge(catl_pd_ii, catl_kind='data', catl_type='mr',
                    sample_s='19', halotype='fof', clf_method=3, hod_n=0,
                    clf_seed=1235, perf_opt=False, return_memb_group=False,
                    print_filedir=False):
    """
    Merges the member and group catalogues for a given set of input
    parameters, and returns a modified version of the galaxy group
    catalogues with added info about the galaxy groups.

    Parameters
    ------------
    catl_pd_ii : int
        Index of the catalogue to match, from
        :func:`~cosmoutils.mock_catalogues.catls_utils.extract_catls`
        function.

    catl_kind : {'data', 'mocks'} str, optional
        Type of catalogue to use. This variable is set to `data` by
        default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} str, optional
        Type of catalogue to use. It shows which abundance matching
        method was used for the CLF when assigning halo masses. This
        variable is set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample to use. This variable is set to '19' by
        default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    halotype : {'fof', 'so'} str, optional
        Type of the dark matter halo of the simulation used to create
        the synthetic catalogues. This variable is set to `fof` by
        default.

        Options:
            - 'fof': Friends-of-Friends halos.
            - 'so' : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies. This
        variable is set to `3` by default.

        Options:
            - `1` : Independent assignment of (g-r) color, sersic, and
              log(ssfr)
            - `2` : (g-r) decides active/passive designation and draw
              values independently.
            - `3` : (g-r) decides active/passive designations, and
              assigns other galaxy properties for that given galaxy.

    hod_n : {0, 1} int, optional
        HOD model to use. Only relevant when `catl_kind == mocks`.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. This variable is set to
        `1235` by default.

    perf_opt : boolean, optional
        If True, it chooses to analyze the `perfect` set of synthetic
        catalogues. This variable is set to `False` by default.

    return_memb_group : `bool`, optional
        If True, the function returns the member and group catalogues,
        along with the merged catalogue. It returns
        ``<memb_group_pd, memb_pd, group_pd>``

    print_filedir : boolean, optional
        If True, the output directory is printed onto the screen.

    Return
    ------------
    memb_group_pd : `pandas.DataFrame`
        Combined version of the i-th member and group catalogues. It
        contains both galaxy and group information.

    memb_pd : `pandas.DataFrame`
        Catalogue of the member galaxies of the i-th catalogue. This
        catalogue contains information of the `member galaxies`.

    group_pd : `pandas.DataFrame`
        Catalogue of the groups of the i-th catalogue. This catalogue
        contains information of the `galaxy groups`.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    catl_pd_ii_valid = (float, int)
    catl_kind_valid = ['data', 'mocks']
    catl_type_valid = ['mr', 'mstar']
    sample_s_valid = ['19', '20', '21']
    halotype_valid = ['fof', 'so']
    clf_method_valid = [1, 2, 3]
    hod_n_valid = [0, 1]
    # `catl_pd_ii`
    if (isinstance(catl_pd_ii, catl_pd_ii_valid)):
        catl_pd_ii = int(catl_pd_ii)
    else:
        # Bug fix: message previously reported `catl_kind` instead of
        # the offending `catl_pd_ii`.
        msg = '{0} `catl_pd_ii` ({1}) is not a valid input!'.format(
            file_msg, type(catl_pd_ii))
        raise LSSUtils_Error(msg)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_type`
    if not (catl_type in catl_type_valid):
        msg = '{0} `catl_type` ({1}) is not a valid input!'.format(
            file_msg, catl_type)
        raise LSSUtils_Error(msg)
    # `sample_s`
    if not (sample_s in sample_s_valid):
        msg = '{0} `sample_s` ({1}) is not a valid input!'.format(
            file_msg, sample_s)
        raise LSSUtils_Error(msg)
    # `halotype`
    if not (halotype in halotype_valid):
        msg = '{0} `halotype` ({1}) is not a valid input!'.format(
            file_msg, halotype)
        raise LSSUtils_Error(msg)
    # `clf_method`
    if not (clf_method in clf_method_valid):
        msg = '{0} `clf_method` ({1}) is not a valid input!'.format(
            file_msg, clf_method)
        raise LSSUtils_Error(msg)
    # `hod_n`
    if not (hod_n in hod_n_valid):
        msg = '{0} `hod_n` ({1}) is not a valid input!'.format(file_msg, hod_n)
        raise LSSUtils_Error(msg)
    # `perf_opt`
    if not (isinstance(perf_opt, bool)):
        msg = '{0} `perf_opt` ({1}) is not a valid type!'.format(
            file_msg, type(perf_opt))
        raise LSSUtils_Error(msg)
    # `return_memb_group`
    if not (isinstance(return_memb_group, bool)):
        msg = '{0} `return_memb_group` ({1}) is not a valid type!'.format(
            file_msg, type(return_memb_group))
        raise LSSUtils_Error(msg)
    # `print_filedir`
    if not (isinstance(print_filedir, bool)):
        msg = '{0} `print_filedir` ({1}) is not a valid type!'.format(
            file_msg, type(print_filedir))
        raise LSSUtils_Error(msg)
    ##
    ## Extracting catalogues given input parameters
    (memb_arr, memb_len) = extract_catls(catl_kind=catl_kind,
                                         catl_type=catl_type,
                                         sample_s=sample_s,
                                         halotype=halotype,
                                         clf_method=clf_method,
                                         hod_n=hod_n,
                                         clf_seed=clf_seed,
                                         perf_opt=perf_opt,
                                         catl_info='members',
                                         return_len=True,
                                         print_filedir=print_filedir)
    # Checking number of catalogues
    if catl_pd_ii > (memb_len - 1):
        msg = '{0} `catl_pd_ii` ({1}) is OUT of range ({2})!'.format(
            file_msg, catl_pd_ii, memb_len)
        raise LSSUtils_Error(msg)
    ##
    ## Extracting group catalogue
    # i-th Galaxy catalogue
    memb_path = memb_arr[catl_pd_ii]
    # i-th Galaxy Group catalogue
    group_path = catl_sdss_dir(catl_kind=catl_kind,
                               catl_type=catl_type,
                               sample_s=sample_s,
                               halotype=halotype,
                               clf_method=clf_method,
                               hod_n=hod_n,
                               clf_seed=clf_seed,
                               perf_opt=perf_opt,
                               catl_info='groups',
                               print_filedir=print_filedir)
    ##
    ## Paths to catalogue
    # Mocks
    if catl_kind == 'mocks':
        group_path += os.path.basename(memb_path).replace('memb', 'group')
    # Data
    if catl_kind == 'data':
        group_path += os.path.basename(memb_path).replace('Gals', 'Group')
    # Checking that file exists
    fd.File_Exists(group_path)
    ##
    ## Reading in Catalogues
    memb_pd = fr.read_hdf5_file_to_pandas_DF(memb_path)
    group_pd = fr.read_hdf5_file_to_pandas_DF(group_path)
    ## Keys for the catalogues
    (gm_key, id_key, galtype_key) = catl_keys(catl_kind,
                                              perf_opt=perf_opt,
                                              return_type='list')
    ## Matching keys from Group catalogue
    if len(np.unique(memb_pd[id_key])) == len(np.unique(group_pd[id_key])):
        # Group column names
        group_colnames = np.sort(group_pd.columns.values)
        group_groupid = np.sort(np.unique(group_pd[id_key]))
        n_groups = len(group_groupid)
        n_memb = len(memb_pd)
        ## Sorting `memb_pd` by `id_key`
        # Member catalogue
        memb_pd.sort_values(by=id_key, inplace=True)
        memb_pd.reset_index(inplace=True, drop=True)
        # Group catalogue
        group_pd.sort_values(by=id_key, inplace=True)
        group_pd.reset_index(inplace=True, drop=True)
        ## Renaming columns (prefix 'GG' marks group-level quantities,
        ## except for the shared merge key)
        g_colnames_dict = {ii: 'GG' + ii for ii in group_colnames}
        group_pd.rename(columns=g_colnames_dict, inplace=True)
        group_pd.rename(columns={'GG' + id_key: id_key}, inplace=True)
        ##
        ## Merging the 2 DataFrames
        memb_group_pd = pd.merge(left=memb_pd,
                                 right=group_pd,
                                 how='left',
                                 left_on=id_key,
                                 right_on=id_key)
    else:
        msg = '{0} Lengths of the 2 DataFrames (`memb_pd`, `group_pd`) '
        msg += 'do not match!'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    ##
    ## Returning DataFrames
    if return_memb_group:
        return_obj = (memb_group_pd, memb_pd, group_pd)
    else:
        return_obj = memb_group_pd

    return return_obj