def read_hdf5_file_to_pandas_DF(hdf5_file, key=None):
    """
    Reads the content of an HDF5 file and converts it to a pandas DataFrame.

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file. This is the file that will be converted
        to a pandas DataFrame.

    key : str or NoneType, optional
        Key or path in `hdf5_file` for the pandas DataFrame and the
        normal HDF5 file.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from `hdf5_file` under the `key` directory.
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(hdf5_file)
    # Reading in Pandas DataFrame
    try:
        df = pd.read_hdf(hdf5_file, key=key)
    except Exception:
        msg = '{0} Could not read `hdf5_file` ({1})! Please check if it exists'
        msg = msg.format(file_msg, hdf5_file)
        raise LSSUtils_Error(msg)

    return df

def get_parser():
    """
    Get parser object for `eco_mocks_create.py` script.

    Returns
    -------
    args:
        input arguments to the script
    """
    ## Define parser object
    description_msg = 'Description of Script'
    parser = ArgumentParser(
        description=description_msg,
        formatter_class=SortingHelpFormatter,
    )
    ##
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    ## Program message
    parser.add_argument('-progmsg',
                        dest='Prog_msg',
                        help='Program message to use throughout the script',
                        type=str,
                        default=cfutils.Program_Msg(__file__))
    ## Parsing Objects
    args = parser.parse_args()

    return args

def pandas_df_to_hdf5_file(df, hdf5_file, key=None, mode='w', complevel=8):
    """
    Saves a `pandas.DataFrame` into a `pandas` HDF5 file.

    Parameters
    ----------
    df : `pandas.DataFrame`
        DataFrame to be converted and saved into a HDF5 file.

    hdf5_file : str
        Path to the output HDF5 file

    key : str or NoneType, optional
        Key or path, under which `df` will be saved in the `hdf5_file`.

    mode : {'w', 'a'}, optional
        Mode to handle `hdf5_file`. This value is set to `w` by default,
        which stands for `write`.

    complevel : int, optional
        Level of compression for `hdf5_file`. The valid range of
        `complevel` is [0, 9]. This is set to a default of 8.
    """
    file_msg = fd.Program_Msg(__file__)
    # Saving DataFrame to `hdf5_file`
    try:
        df.to_hdf(hdf5_file, key, mode=mode, complevel=complevel)
        msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
        print(msg)
    except Exception:
        msg = '{0} Could not create HDF5 file'.format(file_msg)
        raise LSSUtils_Error(msg)

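# A minimal round-trip sketch for the two HDF5 helpers above. The file name
# `example.hdf5` and the key `/Main` are hypothetical:
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'x': [1., 2., 3.], 'y': [4., 5., 6.]})
#   >>> pandas_df_to_hdf5_file(df, 'example.hdf5', key='/Main')
#   >>> df_read = read_hdf5_file_to_pandas_DF('example.hdf5', key='/Main')
#   >>> df_read.equals(df)
#   True
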
def url_checker(url_str):
    """
    Checks if the URL is valid or not.

    Parameters
    -----------
    url_str : `str`
        URL of the website to evaluate.

    Raises
    ----------
    LSSUtils_Error : `Exception`
        Program exception if input parameters are not accepted
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    if not (isinstance(url_str, str)):
        msg = '{0} `url_str` ({1}) is not a STRING!'.format(
            file_msg, type(url_str))
        raise LSSUtils_Error(msg)
    ##
    ## Checking Website
    request_url = requests.get(url_str)
    if (request_url.status_code != 200):
        msg = '{0} `url_str` ({1}) does not exist!'.format(file_msg, url_str)
        raise LSSUtils_Error(msg)

def IDL_read_file(idl_file):
    """
    Reads an IDL file and converts it to a Python dictionary

    Parameters
    ----------
    idl_file : string
        Path to the filename being used

    Returns
    ----------
    idl_dict : python dictionary
        Dictionary with the data from `idl_file`
    """
    # Checking that file exists
    fd.File_Exists(idl_file)
    # Converting to dictionary
    try:
        idl_dict = readsav(idl_file, python_dict=True)
    except Exception:
        msg = '{0} `idl_file` ({1}) is not an IDL file'.format(
            fd.Program_Msg(__file__), idl_file)
        raise LSSUtils_Error(msg)

    return idl_dict

def cookiecutter_paths(path='./'):
    """
    Paths to the main folders in the `Data Science` cookiecutter template.
    This structure was taken from :
    - https://drivendata.github.io/cookiecutter-data-science/

    Parameters
    ----------
    path : str, optional
        Path to the file within the `.git` repository

    Return
    ----------
    param_dict : python dictionary
        Dictionary with info of the project that uses the Data Science
        cookiecutter template.

    Raises
    ----------
    LSSUtils_Error : exception
        If `path` is not within a .git directory, it raises an error.
    """
    # Base Path
    base_dir = git_root_dir(path) + '/'
    # Checking that directory exists
    if os.path.exists(base_dir):
        # Plot Directory
        plot_dir = os.path.join(base_dir, 'reports', 'figures/')
        # Source directory
        src_dir = os.path.join(base_dir, 'src', 'data/')
        # Data path
        data_dir = os.path.join(base_dir, 'data/')
        # External path
        ext_dir = os.path.join(data_dir, 'external/')
        # Processed path
        proc_dir = os.path.join(data_dir, 'processed/')
        # Interim path
        int_dir = os.path.join(data_dir, 'interim/')
        # Raw path
        raw_dir = os.path.join(data_dir, 'raw/')
        # Creating folders
        for dir_ii in [plot_dir, src_dir, data_dir]:
            fd.Path_Folder(dir_ii)
        # Saving to dictionary
        param_dict = {}
        param_dict['base_dir'] = base_dir
        param_dict['plot_dir'] = plot_dir
        param_dict['src_dir'] = src_dir
        param_dict['data_dir'] = data_dir
        param_dict['ext_dir'] = ext_dir
        param_dict['proc_dir'] = proc_dir
        param_dict['int_dir'] = int_dir
        param_dict['raw_dir'] = raw_dir
    else:
        msg = '{0} `base_dir` ({1}) is not a Git directory! Exiting'.format(
            fd.Program_Msg(__file__), base_dir)
        raise LSSUtils_Error(msg)

    return param_dict

def url_file_list(url, ext):
    """
    Lists the files from a URL that have a specific file extension.

    Parameters
    -----------
    url : `str`
        String of the URL

    ext : `str`
        File extension of the files in the URL.

    Returns
    -----------
    files_arr : `numpy.ndarray`, shape (N,)
        Array of the files in `url` that match the file extension `ext`.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking for file type
    # 'URL'
    if not isinstance(url, str):
        msg = '{0} `url` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(url))
        raise TypeError(msg)
    # File extension
    if not isinstance(ext, str):
        msg = '{0} `ext` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(ext))
        raise TypeError(msg)
    ## Reformatting URL
    # Removing whitespaces
    url = url.strip()
    # Removing trailing slash
    if url.endswith('/'):
        url = url[:-1]
    # Checking if URL exists
    url_checker(url)
    # Reading in HTML from page
    page = requests.get(url).text
    # Converting to BeautifulSoup format
    soup = BeautifulSoup(page, 'html.parser')
    ## Obtaining list of files
    # Removing files that are NOT strings
    files_arr_pre = np.array([
        xx.get('href') for xx in soup.find_all('a')
        if isinstance(xx.get('href'), str)
    ])
    # Only those ending with the given extension
    files_pre_ext = np.array([xx for xx in files_arr_pre if xx.endswith(ext)])
    # Prepending the URL to relative links, i.e. those not containing '//'
    files_pre_web = np.array([(url + '/' + xx) if not ('//' in xx) else xx
                              for xx in files_pre_ext])
    # Sorting out file array
    files_arr = np.sort(files_pre_web)

    return files_arr

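# A short sketch of `url_file_list`; the URL and extension below are
# hypothetical:
#
#   >>> files_arr = url_file_list('http://www.example.com/data', '.hdf5')
#   >>> files_arr[:2]  # doctest: +SKIP
#   array(['http://www.example.com/data/catl_0.hdf5',
#          'http://www.example.com/data/catl_1.hdf5'], dtype='<U42')
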
def read_pandas_hdf5(hdf5_file, key=None, ret=False):
    """
    Reads a HDF5 file that contains one or many datasets.
    It converts it into a pandas DataFrame.

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file containing one or more pandas DataFrame(s).

    key : str or NoneType
        If provided, it will extract the `key` value as a pandas DataFrame.
        This value is set to `None` by default.

    ret : `bool`, optional
        If True, it also returns the value of the `key`.
        By default, it is set to False.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from the `hdf5_file` with the data from the `key`
        directory
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that file exists
    fd.File_Exists(hdf5_file)
    # Checking number of keys
    hdf5_obj = pd.HDFStore(hdf5_file)
    hdf5_keys = list(hdf5_obj.keys())
    hdf5_obj.close()
    # Reading in HDF5 file
    if key is None:
        try:
            df = pd.read_hdf(hdf5_file)
            if ret:
                return df, hdf5_keys[0]
            else:
                return df
        except Exception:
            msg = '{0} Must specify which key to use:\n\t'.format(file_msg)
            msg += 'Possible keys: \n'
            print(msg)
            for key_i, name in enumerate(hdf5_keys):
                print('\t Key {0}: {1}'.format(key_i, name))
            raise LSSUtils_Error(msg)
    else:
        if key not in hdf5_keys:
            print('{0} Key not in the file: '.format(file_msg))
            print('Possible Keys:\n')
            for key_i, name in enumerate(hdf5_keys):
                print('\t Key {0}: {1}'.format(key_i, name))
            msg = '{0} Key `{1}` not found in `{2}`!'.format(
                file_msg, key, hdf5_file)
            raise LSSUtils_Error(msg)
        else:
            df = pd.read_hdf(hdf5_file, key=key)
            if ret:
                return df, key
            else:
                return df

def get_parser():
    """
    Get parser object for `eco_mocks_create.py` script.

    Returns
    -------
    args:
        input arguments to the script
    """
    ## Define parser object
    description_msg = 'Downloads the necessary catalogues from the web'
    parser = ArgumentParser(description=description_msg,
                            formatter_class=SortingHelpFormatter,)
    ##
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    # Type of survey
    parser.add_argument('-survey',
                        dest='survey',
                        help='Type of survey to produce. Choices: A, B, ECO',
                        type=str,
                        choices=['A', 'B', 'ECO'],
                        default='ECO')
    ## CPU Counts
    parser.add_argument('-cpu',
                        dest='cpu_frac',
                        help='Fraction of total number of CPUs to use',
                        type=float,
                        default=0.75)
    ## Option for removing file
    parser.add_argument('-remove',
                        dest='remove_files',
                        help="""
                        Delete files from previous analyses with same
                        parameters
                        """,
                        type=_str2bool,
                        default=False)
    ## Program message
    parser.add_argument('-progmsg',
                        dest='Prog_msg',
                        help='Program message to use throughout the script',
                        type=str,
                        default=cfutils.Program_Msg(__file__))
    ## Verbose
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        help='Option to print out project parameters',
                        type=_str2bool,
                        default=False)
    ## Parsing Objects
    args = parser.parse_args()

    return args

def reversed_arrays(x, y):
    """
    Determines if arrays increase or decrease monotonically.

    Parameters
    -----------
    x : `numpy.ndarray`
        Array containing the 1st set of values

    y : `numpy.ndarray`
        Array containing the 2nd set of values.

    Return
    -----------
    mono_opt : `bool`
        If True, `x` increases monotonically with increasing `y`.
        If False, `x` decreases monotonically with increasing `y`.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted
    """
    file_msg = fd.Program_Msg(__file__)
    # Testing input arguments
    valid_types = (list, np.ndarray)
    # x-array
    if not (isinstance(x, valid_types)):
        msg = '{0} `x` ({1}) is not a valid type!'.format(file_msg, type(x))
        raise LSSUtils_Error(msg)
    # y-array
    if not (isinstance(y, valid_types)):
        msg = '{0} `y` ({1}) is not a valid type!'.format(file_msg, type(y))
        raise LSSUtils_Error(msg)
    # x- and y-array shapes
    x = np.asarray(x)
    y = np.asarray(y)
    #
    # Checking if arrays increase or decrease monotonically
    x_diff = np.diff(x).sum()
    y_diff = np.diff(y).sum()
    # Monotonically increasing or decreasing
    if (x_diff > 0) and (y_diff > 0):
        mono_opt = True
    else:
        mono_opt = False

    return mono_opt

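# A quick sketch of `reversed_arrays` on two toy arrays:
#
#   >>> import numpy as np
#   >>> reversed_arrays(np.arange(10), np.arange(10))
#   True
#   >>> reversed_arrays(np.arange(10), np.arange(10)[::-1])
#   False
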
def luminosity_to_absolute_mag(lum, filter_opt,
                               system='SDSS_Blanton_2003_z0.1'):
    """
    Calculates the absolute magnitude of an object through the
    `filter_opt` filter.

    Parameters
    -----------
    lum : float, int, array_like
        Luminosity of 1 or more objects. In units of `solar luminosities`.

    filter_opt : {'U', 'B', 'V', 'R', 'I', 'J', 'H', 'K'} str
        Magnitude filter to use.

    system : {'Binney_and_Merrifield_1998', 'SDSS_Blanton_2003_z0.1'} str
        Kind of filter to use.

        Options:
            - 'Binney_and_Merrifield_1998' : See Binney and Merrifield, 1998
            - 'SDSS_Blanton_2003_z0.1' : See Blanton et al. (2003) Eqn. 14.

    Returns
    -----------
    abs_mag : float, int, or array_like
        Absolute magnitude of one or multiple objects. Same type as `lum`

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    valid_types = (float, int, list, np.ndarray)
    if not (isinstance(lum, valid_types)):
        msg = '{0} `lum` ({1}) is not a valid type!'.format(
            file_msg, type(lum))
        raise LSSUtils_Error(msg)
    ## Obtaining Sun's absolute magnitude
    abs_mag_sun = get_sun_mag(filter_opt, system=system)
    ## Absolute magnitude calculation
    # In units of solar luminosities
    lum_sun = 1.0
    # Absolute magnitude of objects
    abs_mag = abs_mag_sun - 2.5 * np.log10(lum / lum_sun)

    return abs_mag

def get_parser():
    """
    Get parser object for `eco_mocks_create.py` script.

    Returns
    -------
    args:
        input arguments to the script
    """
    ## Define parser object
    description_msg = 'Description of Script'
    parser = ArgumentParser(
        description=description_msg,
        formatter_class=SortingHelpFormatter,
    )
    ##
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    ##
    parser.add_argument('-namevar', '--long-name',
                        dest='variable_name',
                        help='Description of variable',
                        type=float,
                        default=0)
    ##
    parser.add_argument('-namevar1', '--long-name1',
                        dest='variable_name1',
                        help='Description of variable',
                        type=_check_pos_val,
                        default=0.1)
    ## `Perfect Catalogue` Option
    parser.add_argument('-namevar2', '--long-name2',
                        dest='variable_name2',
                        help='Description of variable',
                        type=_str2bool,
                        default=False)
    ## Program message
    parser.add_argument('-progmsg',
                        dest='Prog_msg',
                        help='Program message to use throughout the script',
                        type=str,
                        default=cfutils.Program_Msg(__file__))
    ## Parsing Objects
    args = parser.parse_args()

    return args

def Bins_array_create(arr, base=10, return_tuple=False):
    """
    Generates an evenly-spaced array between the minimum and maximum
    value of a given array.

    Parameters
    ----------
    arr : array_like
        Array of numbers or floats

    base : `int` or `float`, optional
        Interval used to create the evenly-spaced array of elements

    return_tuple : `bool`, optional
        If `True`, the function returns a set of tuples for each bin.
        This variable is set to `False` by default.

    Returns
    ----------
    bins_arr : `numpy.ndarray`
        Array of elements separated in intervals of `base`
    """
    file_msg = fd.Program_Msg(__file__)
    # Transforming input data
    base = float(base)
    arr = np.asarray(arr)
    # Checking array dimensions
    if arr.ndim != 1:
        msg = '{0} The input array is not of dimension 1, but of `{1}`'.format(
            file_msg, arr.ndim)
        raise LSSUtils_Error(msg)
    # Creating evenly-spaced array
    arr_min = myfloor(arr.min(), base=base)
    arr_max = myceil(arr.max(), base=base)
    bins_arr = np.arange(arr_min, arr_max + 0.5 * base, base)
    # Creating tuple if necessary
    if return_tuple:
        bins_arr_mod = np.array([[bins_arr[ii], bins_arr[ii + 1]]
                                 for ii in range(len(bins_arr) - 1)])
        return_obj = bins_arr_mod
    else:
        return_obj = bins_arr

    return return_obj

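# A short sketch of `Bins_array_create`, assuming `myfloor` and `myceil`
# round down/up to the nearest multiple of `base`:
#
#   >>> import numpy as np
#   >>> Bins_array_create(np.array([1.2, 3.7, 8.4]), base=2)
#   array([  0.,   2.,   4.,   6.,   8.,  10.])
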
def absolute_magnitude_to_luminosity(abs_mag, filter_opt,
                                     system='SDSS_Blanton_2003_z0.1'):
    """
    Calculates the luminosity of the object through the `filter_opt`
    filter.

    Parameters
    -----------
    abs_mag : float, int, or array_like
        Absolute magnitude of one or multiple objects.

    filter_opt : {'U', 'B', 'V', 'R', 'I', 'J', 'H', 'K'} str
        Magnitude filter to use.

    system : {'Binney_and_Merrifield_1998', 'SDSS_Blanton_2003_z0.1'} str
        Kind of filter to use.

        Options:
            - 'Binney_and_Merrifield_1998' : See Binney and Merrifield, 1998
            - 'SDSS_Blanton_2003_z0.1' : See Blanton et al. (2003) Eqn. 14.

    Returns
    -----------
    log_L : float or array_like
        Logarithmic value of the luminosity in the `filter_opt` band.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    valid_types = (float, int, list, np.ndarray)
    if not (isinstance(abs_mag, valid_types)):
        msg = '{0} `abs_mag` ({1}) is not a valid type!'.format(
            file_msg, type(abs_mag))
        raise LSSUtils_Error(msg)
    ## Obtaining Sun's absolute magnitude
    abs_mag_sun = get_sun_mag(filter_opt, system=system)
    ## Luminosity calculations
    log_L = (abs_mag_sun - abs_mag) * 0.4

    return log_L

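# The two magnitude/luminosity helpers above are inverses of each other,
# regardless of the Sun's magnitude returned by `get_sun_mag`. A sketch,
# with a hypothetical 'R'-band call:
#
#   >>> log_L = 10.
#   >>> abs_mag = luminosity_to_absolute_mag(10.**log_L, 'R')
#   >>> absolute_magnitude_to_luminosity(abs_mag, 'R')
#   10.0
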
def reshape_arr_1d(arr):
    """
    Transforms the array into a 1-dimensional array, if necessary.

    Parameters
    -----------
    arr : `numpy.ndarray` or array-like
        Array to be converted into 1-dimensional array.

    Returns
    -----------
    arr_new : `numpy.ndarray` or array-like
        Converted array into 1-dimensional array if needed.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    arr_valid_types = (list, np.ndarray)
    # `arr`
    if not (isinstance(arr, arr_valid_types)):
        msg = '{0} `arr` ({1}) is not a valid input type!'.format(
            file_msg, type(arr))
        raise TypeError(msg)
    # Dimensions
    if not (np.asarray(arr).ndim in [1, 2]):
        msg = '{0} The shape of `arr` ({1}) can only have 1 or 2 '
        msg += 'dimensions'
        msg = msg.format(file_msg, np.asarray(arr).ndim)
        raise LSSUtils_Error(msg)
    # Converting to Numpy array
    arr = np.asarray(arr)
    # Reshaping into 1 dimension if the array is of shape (N, 1)
    if (arr.ndim == 2):
        if (arr.shape[1] == 1):
            arr = arr.reshape(len(arr),)

    return arr

def __init__(self, argname, argvalues):
    """
    Initializes class object.

    Parameters
    ----------
    argname : `str`
        Key of the element to change in the main dictionary. It can only
        contain 1 word at a time.

    argvalues : array-like
        List of argvalues for each of the `argnames`. This list will
        be used to loop over the values and replace them into the main
        dictionary.

    Notes
    ----------
    This function loops over the many different elements in `argvalues`.
    This function is meant to be used as a `decorator` for some function
    whose input is a dictionary.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Check input parameters
    # `argname`
    if not (isinstance(argname, str)):
        msg = '{0} `argname` ({1}) must be a string'.format(
            file_msg, type(argname))
        raise TypeError(msg)
    # `argvalues`
    if not (isinstance(argvalues, (tuple, list))):
        msg = '{0} `argvalues` ({1}) must be a tuple or list'.format(
            file_msg, type(argvalues))
        raise TypeError(msg)
    ## Assigning to class variables
    self.argname = argname
    self.argvalues = argvalues
    self.file_msg = file_msg

def pandas_file_to_hdf5_file(df_file, hdf5_file, key=None, mode='w'):
    """
    Converts an HDF5 file with pandas format into a normal HDF5 file.

    Parameters
    ---------
    df_file : str
        Path to the `df_file` containing the pandas DataFrame to be converted

    hdf5_file : str
        Path to the output HDF5 file containing arrays as keys

    key : str or NoneType, optional
        Key or path in HDF5 file for the `df_file` and `hdf5_file`
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(df_file)
    # Reading in DataFrame
    if not key:
        data, key = read_pandas_hdf5(df_file, key=None, ret=True)
    else:
        data = read_pandas_hdf5(df_file, key=key)
    # Rearranging data
    arr_names = data.dtypes.index.values
    dtypes_arr = data.dtypes.values
    dtypes_arr = np.array([x.str for x in dtypes_arr])
    data_dtypes = np.dtype(list(zip(arr_names, dtypes_arr)))
    dataset = np.recarray((len(data), ), dtype=data_dtypes)
    for name in dataset.dtype.names:
        dataset[name] = data[name]
    # Saving file to HDF5 format
    hdf5_obj = h5py.File(hdf5_file, mode=mode)
    hdf5_obj.create_dataset(key, data=dataset)
    hdf5_obj.close()
    msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
    print(msg)

def array_insert(arr1, arr2, axis=1):
    """
    Joins the two arrays into a `single` multi-dimensional array.

    Parameters
    ------------
    arr1 : `numpy.ndarray`
        1st array to merge

    arr2 : `numpy.ndarray`
        2nd array to merge

    axis : int, optional
        Axis to use for the merging

    Returns
    ----------
    arr_merged : `numpy.ndarray`
        Merged array from `arr1` and `arr2`.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    arr_valid_types = (list, np.ndarray)
    # `arr1`
    if not (isinstance(arr1, arr_valid_types)):
        msg = '{0} `arr1` ({1}) is not array-like!'.format(
            file_msg, type(arr1))
        raise ValueError(msg)
    # `arr2`
    if not (isinstance(arr2, arr_valid_types)):
        msg = '{0} `arr2` ({1}) is not array-like!'.format(
            file_msg, type(arr2))
        raise ValueError(msg)
    #
    # Merging arrays
    arr_merged = np.insert(arr1, len(arr1.T), arr2, axis=axis)

    return arr_merged

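# A sketch of `array_insert` appending a new column to a 2-d array:
#
#   >>> import numpy as np
#   >>> arr1 = np.array([[1, 2], [3, 4]])
#   >>> arr2 = np.array([5, 6])
#   >>> array_insert(arr1, arr2, axis=1)
#   array([[1, 2, 5],
#          [3, 4, 6]])
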
def sdss_catl_clean(catl_pd, catl_kind, catl_info='members', reindex=True):
    """
    Cleans the catalogue by removing `failed` values.

    Parameters
    -----------
    catl_pd : `pandas.DataFrame`
        Dataset with the catalogue information.

    catl_kind : {'data', 'mocks'} str, optional
        Type of catalogue to use. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    reindex : `bool`, optional
        If True, the output catalogue is re-indexed.

    Return
    -----------
    catl_pd_clean : `pandas.DataFrame`
        Cleaned version of `catl_pd`, after having removed `failed` values.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    catl_kind_valid = ['data', 'mocks']
    catl_info_valid = ['members', 'groups']
    # `catl_pd`
    if not (isinstance(catl_pd, pd.DataFrame)):
        msg = '{0} `catl_pd` ({1}) is not a valid type!'.format(
            file_msg, type(catl_pd))
        raise LSSUtils_Error(msg)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_info`
    if not (catl_info in catl_info_valid):
        msg = '{0} `catl_info` ({1}) is not a valid input!'.format(
            file_msg, catl_info)
        raise LSSUtils_Error(msg)
    # `reindex`
    if not (isinstance(reindex, bool)):
        msg = '{0} `reindex` ({1}) is not a valid type!'.format(
            file_msg, type(reindex))
        raise LSSUtils_Error(msg)
    #
    # Defining `failed` values
    ssfr_fail_arr = [0, -99, -999, np.nan]
    mstar_fail_arr = [-1, 0, np.nan]
    #
    # Getting keys for catalogues
    (logssfr_key, logmstar_key) = catl_keys_prop(catl_kind=catl_kind,
                                                 catl_info=catl_info,
                                                 return_type='list')
    #
    # Cleaning catalogue entries
    # Data
    if catl_kind == 'data':
        # Clean version
        catl_pd_clean = catl_pd[~catl_pd[logssfr_key].isin(ssfr_fail_arr) &
                                ~catl_pd[logmstar_key].isin(mstar_fail_arr)]
    # Mocks
    if catl_kind == 'mocks':
        # Clean version
        catl_pd_clean = catl_pd[~catl_pd[logssfr_key].isin(ssfr_fail_arr)]
    #
    # Reindexing
    if reindex:
        catl_pd_clean.reset_index(inplace=True, drop=True)

    return catl_pd_clean

def train_test_dataset(pred_arr, feat_arr, pre_opt='min_max',
                       shuffle_opt=True, random_state=0, test_size=0.25,
                       reshape=False, return_idx=False):
    """
    Function to create the training and testing datasets for a given set
    of features array and predicted array.

    Parameters
    -----------
    pred_arr : `pandas.DataFrame`, `numpy.ndarray` or array-like, shape (n_samples, n_outcomes)
        Array consisting of the `predicted values`. The dimensions of
        `pred_arr` are `n_samples` by `n_outcomes`, where `n_samples`
        is the number of observations, and `n_outcomes` the number of
        predicted outcomes.

    feat_arr : `numpy.ndarray`, `pandas.DataFrame` or array-like, shape (n_samples, n_features)
        Array consisting of the `feature values`. The dimensions of
        `feat_arr` are `n_samples` by `n_features`, where `n_samples`
        is the number of observations, and `n_features` the number of
        features used.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`.

        Options:
            - 'min_max' : Turns `feat_arr` to values between (0,1)
            - 'standard' : Uses `sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    shuffle_opt : `bool`, optional
        If True, the data is shuffled before splitting into testing and
        training datasets. This variable is set to True by default.

    random_state : int, optional
        Random state number used when splitting into training and
        testing datasets. If set, it will always have the same seed
        `random_state`. This variable is set to `0` by default.

    test_size : float, optional
        Fraction of the catalogue used for the `testing` dataset.
        This variable must be between (0,1). This variable is set to
        `0.25` by default.

    reshape : `bool`, optional
        If True, it reshapes `feat_arr` into a 1d array if its shape is
        equal to (ncols, 1), where `ncols` is the number of columns.
        This variable is set to `False` by default.

    return_idx : `bool`, optional
        If `True`, it returns the indices of the `training` and `testing`
        datasets. This variable is set to `False` by default.

    Returns
    -----------
    train_dict : `dict`
        Dictionary containing the `training` data from the catalogue.

    test_dict : `dict`
        Dictionary containing the `testing` data from the catalogue.

    See also
    -----------
    data_preprocessing : Function to preprocess a dataset.
""" file_msg = fd.Program_Msg(__file__) ## Checking input parameters # `pred_arr` pred_arr_type_valid = (list, np.ndarray, pd.DataFrame) if not (isinstance(pred_arr, pred_arr_type_valid)): msg = '{0} `pred_arr` ({1}) is not a valid input type'.format( file_msg, type(pred_arr)) raise LSSUtils_Error(msg) # `feat_arr` feat_arr_type_valid = (list, np.ndarray, pd.DataFrame) if not (isinstance(feat_arr, feat_arr_type_valid)): msg = '{0} `feat_arr` ({1}) is not a valid input type'.format( file_msg, type(feat_arr)) raise LSSUtils_Error(msg) # `pre_opt` pre_opt_valid = ['min_max', 'standard', 'normalize', 'no'] if not (pre_opt in pre_opt_valid): msg = '{0} `pre_opt` ({1}) is not a valid input'.format( file_msg, pre_opt) raise LSSUtils_Error(msg) # `shuffle_opt` shuffle_opt_type_valid = (bool) if not (isinstance(shuffle_opt, shuffle_opt_type_valid)): msg = '{0} `shuffle_opt` ({1}) is not a valid input type'.format( file_msg, type(shuffle_opt)) raise LSSUtils_Error(msg) # `random_state` random_state_type_valid = (int) if not (isinstance(random_state, random_state_type_valid)): msg = '{0} `random_state` ({1}) is not a valid input'.format( file_msg, random_state) raise LSSUtils_Error(msg) # `test_size` if not ((test_size > 0) and (test_size < 1.)): msg = '{0} `test_size` ({1}) must be in range (0,1)'.format( file_msg, test_size) raise LSSUtils_Error(msg) ## ## Checking indices of `pred_arr` and `feat_arr` if return_idx: # If object is a DataFrame if (isinstance(pred_arr, pd.DataFrame) and isinstance(feat_arr, pd.DataFrame)): pred_arr_idx = pred_arr.index.values feat_arr_idx = feat_arr.index.values else: pred_arr_idx = np.arange(len(pred_arr)) feat_arr_idx = np.arange(len(feat_arr)) # Reshaping if necessary if reshape: pred_arr_idx = gu.reshape_arr_1d(pred_arr_idx) feat_arr_idx = gu.reshape_arr_1d(feat_arr_idx) ## ## Checking dimensions of `pred_arr` and `feat_arr` pred_arr = np.asarray(pred_arr) feat_arr = np.asarray(feat_arr) # Dimensions if reshape: pred_arr = gu.reshape_arr_1d(pred_arr) feat_arr = gu.reshape_arr_1d(feat_arr) # Shape if (len(pred_arr) != len(feat_arr)): msg = '{0} The shape of `pred_arr` ({1}) and `feat_arr` ({2}) must ' msg += 'have the same length' msg = msg.format(file_msg, len(pred_arr), len(feat_arr)) raise LSSUtils_Error(msg) ## ## Rescaling Dataset feat_arr_scaled = data_preprocessing(feat_arr, pre_opt=pre_opt, reshape=reshape) ## ## Splitting into `Training` and `Testing` datasets. # Scaled (X_train, X_test, Y_train, Y_test) = skms.train_test_split(feat_arr_scaled, pred_arr, test_size=test_size, shuffle=shuffle_opt, random_state=random_state) # Not-scaled (X_train_ns, X_test_ns, Y_train_ns, Y_test_ns) = skms.train_test_split(feat_arr, pred_arr, test_size=test_size, shuffle=shuffle_opt, random_state=random_state) # Returning indices if necessary if return_idx: # Splitting to `training` and `testing` (X_train_idx, X_test_idx, Y_train_idx, Y_test_idx) = skms.train_test_split(feat_arr_idx, pred_arr_idx, test_size=test_size, shuffle=shuffle_opt, random_state=random_state) if not (np.array_equal(X_train_idx, Y_train_idx) and np.array_equal(X_test_idx, Y_test_idx)): msg = '{0} Index arrays are not equal to each other!' 
            raise LSSUtils_Error(msg)
    ##
    ## Assigning `training` and `testing` datasets to dictionaries
    # Saving indices if necessary
    if return_idx:
        # Adding 'indices' to dictionaries
        train_dict = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_train_ns': X_train_ns,
            'Y_train_ns': Y_train_ns,
            'train_idx': X_train_idx
        }
        test_dict = {
            'X_test': X_test,
            'Y_test': Y_test,
            'X_test_ns': X_test_ns,
            'Y_test_ns': Y_test_ns,
            'test_idx': X_test_idx
        }
    else:
        train_dict = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_train_ns': X_train_ns,
            'Y_train_ns': Y_train_ns
        }
        test_dict = {
            'X_test': X_test,
            'Y_test': Y_test,
            'X_test_ns': X_test_ns,
            'Y_test_ns': Y_test_ns
        }

    return train_dict, test_dict

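# A minimal sketch of `train_test_dataset` on random data; the shapes
# below are arbitrary:
#
#   >>> import numpy as np
#   >>> feat_arr = np.random.rand(100, 4)
#   >>> pred_arr = np.random.rand(100)
#   >>> train_dict, test_dict = train_test_dataset(pred_arr, feat_arr)
#   >>> train_dict['X_train'].shape    # 75% of 100 samples, 4 features
#   (75, 4)
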
def scoring_methods(truth_arr, feat_arr=None, pred_arr=None, model=None,
                    score_method='perc', threshold=0.1, perc=0.68):
    """
    Determines the overall score for given arrays, i.e. the `predicted`
    array and the `truth` array.

    Parameters
    -----------
    truth_arr : `numpy.ndarray` or array-like, shape (n_samples, n_outcomes)
        Array consisting of the `true` values for the `n_samples`
        observations. The dimensions of `truth_arr` are `n_samples` by
        `n_outcomes`, where `n_samples` is the number of observations,
        and `n_outcomes` the number of predicted outcomes.

    feat_arr : `numpy.ndarray`, array-like, or `NoneType`, shape (n_samples, n_features)
        Array consisting of the `feature values`. The dimensions of
        `feat_arr` are `n_samples` by `n_features`, where `n_samples`
        is the number of observations, and `n_features` the number of
        features used. This variable is set to `None` by default.

    pred_arr : `numpy.ndarray`, array-like, or `NoneType`, shape (n_samples, n_outcomes)
        Array of predicted values from `feat_arr`. If ``model == None``,
        this variable must be an array-like object. If ``model != None``,
        this variable will not be used, and will be calculated using the
        `model` object. This variable is set to `None` by default.

    model : scikit-learn model object or `NoneType`
        Model used to estimate the score if ``score_method == 'model_score'``.
        This variable is set to `None` by default.

    score_method : {'perc', 'threshold', 'model_score', 'r2'} `str`, optional
        Type of scoring to use when determining how well an algorithm
        is performing.

        Options:
            - 'perc' : Use percentage and rank-ordering of the values
            - 'threshold' : Score based on diffs of `threshold` or less from the true value.
            - 'model_score' : Out-of-the-box method from `sklearn` to determine success.
            - 'r2' : R-squared statistic for error calculation.

    threshold : float, optional
        Value to use when calculating the error within `threshold` value
        from the truth. This variable is set to `0.1` by default.

    perc : float, optional
        Percentile value, from [0,1], used when determining the score.
        This variable is set to `0.68` by default.

    Returns
    -----------
    method_score : float
        Overall score from `pred_arr` to predict `truth_arr`.

    Notes
    -----------
    For more information on model evaluation, see
    `http://scikit-learn.org/stable/modules/model_evaluation.html`_.
""" file_msg = fd.Program_Msg(__file__) ## Checking input parameters # `feat_arr` feat_arr_type_valid = (list, np.ndarray, type(None)) if not (isinstance(feat_arr, feat_arr_type_valid)): msg = '{0} `feat_arr` ({1}) is not a valid input type'.format( file_msg, type(feat_arr)) raise LSSUtils_Error(msg) # `truth_arr` truth_arr_type_valid = (list, np.ndarray) if not (isinstance(truth_arr, truth_arr_type_valid)): msg = '{0} `truth_arr` ({1}) is not a valid input type'.format( file_msg, type(truth_arr)) raise LSSUtils_Error(msg) # `score_method` - Type score_method_type_valid = (str) if not (isinstance(score_method, score_method_type_valid)): msg = '{0} `score_method` ({1}) is not a valid input type'.format( file_msg, type(score_method)) raise LSSUtils_Error(msg) # `score_method` - Value score_method_valid = ['perc', 'threshold', 'model_score', 'r2'] if not (score_method in score_method_valid): msg = '{0} `score_method` ({1}) is not a valid input!'.format( file_msg, score_method) raise LSSUtils_Error(score_method) # `threshold` - Type threshold_valid = (float, int) if not (isinstance(threshold, threshold_valid)): msg = '{0} `threshold` ({1}) is not a valid input type'.format( file_msg, type(threshold)) raise LSSUtils_Error(msg) # `threshold` - Value if not (threshold >= 0.): msg = '{0} `threshold` ({1}) must be larger than 0!'.format( file_msg, threshold) raise LSSUtils_Error(msg) ## ## Checking for `model`, `pred_arr` and `feat_arr` # If both are none if ((model is None) and (pred_arr is None)): msg = '{0} `model` and `pred_arr` cannot both be `None`. ' msg += 'Only one can be `None`' msg = msg.format(file_msg) raise LSSUtils_Error(msg) # If `feat_arr` and `pred_arr` are `None` if ((feat_arr is None) and (pred_arr is None)): msg = '{0} `feat_arr` and `pred_arr` cannot both be `None`'.format( file_msg) raise TypeError(msg) # `pred_arr` - Type # If both are `None` pred_arr_valid = ((list, np.ndarray)) if (model is None): if not (isinstance(pred_arr, pred_arr_valid)): msg = '{0} `pred_arr` ({1}) is not a valid input type!'.format( file_msg, type(pred_arr)) raise LSSUtils_Error(msg) ## ## Choosing scoring method # Percentile method if (score_method == 'perc'): # Checking for `pred_arr` if (pred_arr is None): pred_arr = model.predict(feat_arr) # Checking for `model` if (model is None): pred_arr = np.asarray(pred_arr) # Error calcualtion pred_err = np.abs(pred_arr - truth_arr) method_score = scipy.stats.scoreatpercentile(pred_err, 100. * perc) # Threshold method if (score_method == 'threshold'): # Checking for `pred_arr` if (pred_arr is None): pred_arr = model.predict(feat_arr) # Checking for `model` if (model is None): pred_arr = np.asarray(pred_arr) # Error calcualtion pred_err = np.abs(pred_arr - truth_arr) pred_thresh = len(pred_err[pred_err <= threshold]) method_score = pred_thresh / len(pred_arr) # R-squared method if (score_method == 'r2'): # Checking for `pred_arr` if (pred_arr is None): pred_arr = model.predict(feat_arr) # Checking for `model` if (model is None): pred_arr = np.asarray(pred_arr) # Error calcualtion method_score = skmetrics.r2_score(truth_arr, pred_arr) # Model method if (score_method == 'model_score'): method_score = model.score(feat_arr, truth_arr) return method_score
def get_parser():
    """
    Get parser object for `eco_mocks_create.py` script.

    Returns
    -------
    args:
        input arguments to the script
    """
    ## Define parser object
    description_msg = 'Main analysis of the `Red Sequence` project.'
    parser = ArgumentParser(
        description=description_msg,
        formatter_class=SortingHelpFormatter,
    )
    ##
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    ## 1st Magnitude band
    parser.add_argument('-mband_1',
                        dest='mband_1',
                        help='First apparent magnitude band to analyze.',
                        type=str,
                        choices=[
                            'mag_auto_g', 'mag_auto_r', 'mag_auto_i',
                            'mag_auto_z', 'mag_auto_y'
                        ],
                        default='mag_auto_g')
    ## 2nd Magnitude band
    parser.add_argument('-mband_2',
                        dest='mband_2',
                        help='Second apparent magnitude band to analyze.',
                        type=str,
                        choices=[
                            'mag_auto_g', 'mag_auto_r', 'mag_auto_i',
                            'mag_auto_z', 'mag_auto_y'
                        ],
                        default='mag_auto_z')
    ## 3rd Magnitude band
    parser.add_argument('-mband_3',
                        dest='mband_3',
                        help='Third apparent magnitude band to analyze.',
                        type=str,
                        choices=[
                            'mag_auto_g', 'mag_auto_r', 'mag_auto_i',
                            'mag_auto_z', 'mag_auto_y'
                        ],
                        default='mag_auto_i')
    ## Maximum difference between `mband_1` and `mband_2`
    parser.add_argument('-mag_diff_tresh',
                        dest='mag_diff_tresh',
                        help="""
                        Maximum threshold of the difference between `mband_1`
                        and `mband_2`. It must be larger than `4`.
                        """,
                        type=_check_pos_val,
                        default=4.)
    ## Bottom magnitude limit for `mband_1` and `mband_2`
    parser.add_argument('-mag_min',
                        dest='mag_min',
                        help="""
                        Bottom magnitude limit for `mband_1` and `mband_2`.
                        """,
                        type=float,
                        default=24.)
    ## Upper magnitude limit for `mband_1` and `mband_2`
    parser.add_argument('-mag_max',
                        dest='mag_max',
                        help="""
                        Upper magnitude limit for `mband_1` and `mband_2`.
                        """,
                        type=float,
                        default=17.)
    ## Maximum number of elements to download
    parser.add_argument('-master_limit',
                        dest='master_limit',
                        help='Number of elements to use for the MASTER file',
                        type=int,
                        default=100000)
    ## Aperture radius in `arcseconds`
    parser.add_argument('-radius_size',
                        dest='radius_size',
                        help='Size of radius on the Sky. In units of `arcsec`',
                        type=_check_pos_val,
                        default=5.)
    ## Cosmology Choice
    parser.add_argument('-cosmo',
                        dest='cosmo_choice',
                        help='Choice of Cosmology',
                        type=str,
                        choices=['WMAP5', 'WMAP7', 'WMAP9', 'Planck15',
                                 'custom'],
                        default='WMAP7')
    ## Redshift bin size
    parser.add_argument('-z_binsize',
                        dest='z_binsize',
                        help='Size of bin for redshift `z`',
                        type=_check_pos_val,
                        default=0.0125)
    ## Minimum redshift value
    parser.add_argument('-z_min',
                        dest='z_min',
                        help='Minimum redshift to analyze.',
                        type=_check_pos_val,
                        default=0.4)
    ## Maximum redshift value
    parser.add_argument('-z_max',
                        dest='z_max',
                        help='Maximum redshift to analyze.',
                        type=_check_pos_val,
                        default=1.0)
    ## Choice of the input galaxy cluster location
    parser.add_argument('-input_catl_loc',
                        dest='input_catl_loc',
                        help='Choice of the input galaxy cluster location.',
                        type=str,
                        choices=['RedMapper', 'SDSS'],
                        default='RedMapper')
    ## Choice of binning
    parser.add_argument('-hist_nbins',
                        dest='hist_nbins',
                        help='Number of bins for x- and y-axis.',
                        type=_check_pos_val,
                        default=200)
    ## Option for removing file
    parser.add_argument('-remove',
                        dest='remove_files',
                        help="""
                        Delete files from previous analyses with same
                        parameters
                        """,
                        type=_str2bool,
                        default=False)
    ## Program message
    parser.add_argument('-progmsg',
                        dest='Prog_msg',
                        help='Program message to use throughout the script',
                        type=str,
                        default=cfutils.Program_Msg(__file__))
    ## Verbose
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        help='Option to print out project parameters',
                        type=_str2bool,
                        default=False)
    ## Parsing Objects
    args = parser.parse_args()

    return args

def catl_sdss_merge(catl_pd_ii, catl_kind='data', catl_type='mr',
                    sample_s='19', halotype='fof', clf_method=3, hod_n=0,
                    clf_seed=1235, dv=1.0, sigma_clf_c=0.1417,
                    perf_opt=False, return_memb_group=False,
                    print_filedir=False):
    """
    Merges the member and group catalogues for a given set of input
    parameters, and returns a modified version of the galaxy group
    catalogues with added info about the galaxy groups.

    Parameters
    ------------
    catl_pd_ii : `int`
        Index of the catalogue to match, from
        :func:`~cosmo_utils.mock_catalogues.catls_utils.extract_catls`
        function.

    catl_kind : {'data', 'mocks'} `str`, optional
        Type of catalogue to use. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} `str`, optional
        Type of catalogue to use. It shows which abundance matching method
        was used for the CLF when assigning halo masses. This variable is
        set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample to use. This variable is set to '19' by
        default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    halotype : {'fof', 'so'} str, optional
        Type of the dark matter halo of the simulation used to create the
        synthetic catalogues. This variable is set to `fof` by default.

        Options:
            - 'fof' : Friends-of-Friends halos.
            - 'so'  : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies.
        This variable is set to `3` by default.

        Options:
            - `1` : Independent assignment of (g-r) color, sersic, and log(ssfr)
            - `2` : (g-r) decides active/passive designation and draws values independently.
            - `3` : (g-r) decides active/passive designations, and assigns other galaxy properties for that given galaxy.

    hod_n : int, optional
        HOD model to use. Only relevant when ``catl_kind == 'mocks'``.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. This variable is set to
        `1235` by default.

    dv : float, optional
        Difference between galaxy and mass velocity profiles
        (v_g-v_c)/(v_m-v_c). This value is set to `1.0` by default.

    sigma_clf_c : `float`, optional
        Value of the scatter in log(L) for central galaxies in the CLF.
        This variable is set to ``0.1417`` by default.

    perf_opt : `bool`, optional
        If True, it chooses to analyze the `perfect` set of synthetic
        catalogues. This variable is set to `False` by default.

    return_memb_group : `bool`, optional
        If True, the function returns the member and group catalogues,
        along with the merged catalogue. It returns
        ``<memb_group_pd, memb_pd, group_pd>``.

    print_filedir : `bool`, optional
        If True, the output directory is printed onto the screen.

    Return
    ------------
    memb_group_pd : `pandas.DataFrame`
        Combined version of the i-th member and group catalogues.
        It contains both galaxy and group information.

    memb_pd : `pandas.DataFrame`
        Catalogue of the member galaxies of the i-th catalogue.
        This catalogue contains information of the `member galaxies`.

    group_pd : `pandas.DataFrame`
        Catalogue of the groups of the i-th catalogue. This catalogue
        contains information of the `galaxy groups`.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if input parameters are not accepted.
""" file_msg = fd.Program_Msg(__file__) ## Checking input parameters catl_pd_ii_valid = (float, int, np.int64, np.int32, np.float32, np.float64) catl_kind_valid = ['data', 'mocks'] catl_type_valid = ['mr', 'mstar'] sample_s_valid = ['19', '20', '21'] halotype_valid = ['fof', 'so'] clf_method_valid = [1, 2, 3] hod_n_valid = np.arange(0, 20) # `catl_pd_ii` if (isinstance(catl_pd_ii, catl_pd_ii_valid)): catl_pd_ii = int(catl_pd_ii) else: msg = '{0} `catl_pd_ii` ({1}) is not a valid input!'.format( file_msg, type(catl_pd_ii)) raise LSSUtils_Error(msg) # `catl_kind` if not (catl_kind in catl_kind_valid): msg = '{0} `catl_kind` ({1}) is not a valid input!'.format( file_msg, catl_kind) raise LSSUtils_Error(msg) # `catl_type` if not (catl_type in catl_type_valid): msg = '{0} `catl_type` ({1}) is not a valid input!'.format( file_msg, catl_type) raise LSSUtils_Error(msg) # `sample_s` if not (sample_s in sample_s_valid): msg = '{0} `sample_s` ({1}) is not a valid input!'.format( file_msg, sample_s) raise LSSUtils_Error(msg) # `halotype` if not (halotype in halotype_valid): msg = '{0} `halotype` ({1}) is not a valid input!'.format( file_msg, halotype) raise LSSUtils_Error(msg) # `clf_method` if not (clf_method in clf_method_valid): msg = '{0} `clf_method` ({1}) is not a valid input!'.format( file_msg, clf_method) raise LSSUtils_Error(msg) # `dv` if not (dv > 0): msg = '{0} `dv` ({1}) must be larger than 0!'.format(file_msg, dv) raise LSSUtils_Error(msg) # `sigma_clf_c` - Type if not (isinstance(sigma_clf_c, float)): msg = '{0} `sigma_clf_c` ({1}) is not a valid input type!' msg = msg.format(file_msg, type(sigma_clf_c)) raise LSSUtils_Error(msg) # `sigma_clf_c` - Value if not (sigma_clf_c >= 0.): msg = '{0} `sigma_clf_c` ({1}) must be larger than 0!' msg = msg.format(file_msg, sigma_clf_c) raise LSSUtils_Error(msg) # `hod_n` if not (hod_n in hod_n_valid): msg = '{0} `hod_n` ({1}) is not a valid input!'.format(file_msg, hod_n) raise LSSUtils_Error(msg) # `perf_opt` if not (isinstance(perf_opt, bool)): msg = '{0} `perf_opt` ({1}) is not a valid type!'.format( file_msg, type(perf_opt)) raise LSSUtils_Error(msg) # `return_memb_group` if not (isinstance(return_memb_group, bool)): msg = '{0} `return_memb_group` ({1}) is not a valid type!'.format( file_msg, type(return_memb_group)) raise LSSUtils_Error(msg) # `print_filedir` if not (isinstance(print_filedir, bool)): msg = '{0} `print_filedir` ({1}) is not a valid type!'.format( file_msg, type(print_filedir)) raise LSSUtils_Error(msg) # # Extracting catalogues given input parameters (memb_arr, memb_len) = extract_catls(catl_kind=catl_kind, catl_type=catl_type, sample_s=sample_s, halotype=halotype, clf_method=clf_method, hod_n=hod_n, clf_seed=clf_seed, dv=dv, sigma_clf_c=sigma_clf_c, perf_opt=perf_opt, catl_info='members', return_len=True, print_filedir=print_filedir) # Checking number of catalogues if catl_pd_ii > (memb_len - 1): msg = '{0} `catl_pd_ii` ({1}) is OUT of range ({2})!'.format( file_msg, catl_pd_ii, memb_len) raise LSSUtils_Error(msg) # # Extracting group catalogue # i-th Galaxy catalogue memb_path = memb_arr[catl_pd_ii] # i-th Galaxy Group catalogue group_path = catl_sdss_dir(catl_kind=catl_kind, catl_type=catl_type, sample_s=sample_s, halotype=halotype, clf_method=clf_method, dv=dv, sigma_clf_c=sigma_clf_c, hod_n=hod_n, clf_seed=clf_seed, perf_opt=perf_opt, catl_info='groups', print_filedir=print_filedir) # # Paths to catalogue # Mocks if catl_kind == 'mocks': group_path = os.path.join( group_path, os.path.basename(memb_path).replace('memb', 
    # Data
    if catl_kind == 'data':
        group_path = os.path.join(
            group_path,
            os.path.basename(memb_path).replace('Gals', 'Group'))
    # Checking that file exists
    fd.File_Exists(group_path)
    ##
    ## Reading in Catalogues
    memb_pd = fr.read_hdf5_file_to_pandas_DF(memb_path)
    group_pd = fr.read_hdf5_file_to_pandas_DF(group_path)
    ## Keys for the catalogues
    (gm_key, id_key, galtype_key) = catl_keys(catl_kind,
                                              perf_opt=perf_opt,
                                              return_type='list')
    ## Matching keys from Group catalogue
    if len(np.unique(memb_pd[id_key])) == len(np.unique(group_pd[id_key])):
        # Group column names
        group_colnames = np.sort(group_pd.columns.values)
        ## Sorting `memb_pd` by `id_key`
        # Member catalogue
        memb_pd.sort_values(by=id_key, inplace=True)
        memb_pd.reset_index(inplace=True, drop=True)
        # Group catalogue
        group_pd.sort_values(by=id_key, inplace=True)
        group_pd.reset_index(inplace=True, drop=True)
        ## Renaming columns
        g_colnames_dict = {ii: 'GG_' + ii for ii in group_colnames}
        group_pd.rename(columns=g_colnames_dict, inplace=True)
        group_pd.rename(columns={'GG_' + id_key: id_key}, inplace=True)
        ##
        ## Merging the 2 DataFrames
        memb_group_pd = pd.merge(left=memb_pd,
                                 right=group_pd,
                                 how='left',
                                 left_on=id_key,
                                 right_on=id_key)
    else:
        msg = '{0} The number of unique group IDs in the 2 DataFrames '
        msg += '(`memb_pd`, `group_pd`) does not match!'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    ##
    ## Returning DataFrames
    if return_memb_group:
        return_obj = (memb_group_pd, memb_pd, group_pd)
    else:
        return_obj = memb_group_pd

    return return_obj

def concatenate_pd_df(directory, filetype='hdf5', foutput=None, outonly=True):
    """
    Concatenates pandas DataFrames into a single DataFrame

    Parameters
    ----------
    directory : str
        Path to the folder containing multiple pandas-HDF5 files

    filetype : str, optional
        File format of the files in `directory` to be read.
        This is set to `hdf5` by default.

    foutput : str or NoneType
        If not `None`, it is the basename of the output file in HDF5
        format

    outonly : `bool`, optional
        If True, it returns the concatenated pandas DataFrame.
        If False, it only saves the concatenated `pandas.DataFrame`.

    Returns
    ----------
    df_conc : `pandas.DataFrame`
        DataFrame containing the combined datasets from the files in
        `directory`.

    Raises
    ----------
    LSSUtils_Error : Exception
        If no files are found in `directory`, it raises an error
        warning about this.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that `directory` exists
    if not os.path.exists(directory):
        msg = '{0} `directory` {1} is not a valid path! Exiting!'.format(
            file_msg, directory)
        raise LSSUtils_Error(msg)
    # Concatenating files
    files_arr = fd.index(directory, '.' + filetype, sort=True)
    print('{0} Found `{1}` files'.format(file_msg, files_arr.size))
    if len(files_arr) > 0:
        # Initializing array that contains info
        df_arr = [[] for x in range(len(files_arr))]
        # Looping over HDF5 (pandas) files
        for ii, file_ii in enumerate(files_arr):
            df_arr[ii] = read_pandas_hdf5(file_ii)
        # Concatenating arrays
        df_conc = pd.concat(df_arr, ignore_index=True)
        # Deciding name of resulting output file
        if (foutput is not None) and (type(foutput) == str):
            foutput_file = os.path.join(directory,
                                        '{0}.{1}'.format(foutput, filetype))
            # Saving resulting DataFrame
            pandas_df_to_hdf5_file(df_conc, foutput_file, key='/Main')
            # Checking that file exists
            fd.File_Exists(foutput_file)
            print('{0} Output file saved in: {1}'.format(
                file_msg, foutput_file))
        # If only outputting concatenated DataFrame
        if outonly:
            return df_conc
    else:
        msg = '{0} No files in `{1}` with extension `{2}`'.format(
            file_msg, directory, filetype)
        raise LSSUtils_Error(msg)

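# A sketch of `concatenate_pd_df`; the directory below is hypothetical and
# would contain several pandas-HDF5 files of the same schema:
#
#   >>> df_conc = concatenate_pd_df('/path/to/hdf5_files',
#   ...                             filetype='hdf5', foutput='combined')
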
def url_files_download(url, ext, outdir, check_exist=False, create_dir=False,
                       remove_files=False, bar_opt='tqdm'):
    """
    Downloads the files from a URL to a local directory. The files must
    match a specific file extension, `ext`.

    Parameters
    -----------
    url : `str`
        String of the URL

    ext : `str`
        File extension of the files in the URL.

    outdir : `str`
        Path to the output directory. This is the directory, to which
        the files with extensions `ext` will be saved.

    check_exist : `bool`, optional
        If `True`, it checks for whether or not the file exists.
        This variable is set to `False` by default.

    create_dir : `bool`, optional
        If `True`, it creates the directory if it does not exist.
        This variable is set to `False` by default.

    remove_files : `bool`, optional
        If `True`, local files that are present that match the files at
        the URL will be replaced by the new versions. This variable is
        set to ``False`` by default.

    bar_opt : {'tqdm', 'native'}
        Option for which type of progress bar to use when downloading
        files. This variable is set to `tqdm` by default.

        Options:
            - 'tqdm' : Uses a tqdm-based progress bar
            - 'native': Uses the wget-based native progress bar.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking for file type
    # 'URL'
    if not isinstance(url, str):
        msg = '{0} `url` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(url))
        raise TypeError(msg)
    # File extension
    if not isinstance(ext, str):
        msg = '{0} `ext` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(ext))
        raise TypeError(msg)
    # Output directory
    if not isinstance(outdir, str):
        msg = '{0} `outdir` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(outdir))
        raise TypeError(msg)
    # `check_exist`
    if not (isinstance(check_exist, bool)):
        msg = '`check_exist` ({0}) must be of `boolean` type!'.format(
            type(check_exist))
        raise TypeError(msg)
    # `create_dir`
    if not (isinstance(create_dir, bool)):
        msg = '`create_dir` ({0}) must be of `boolean` type!'.format(
            type(create_dir))
        raise TypeError(msg)
    # `bar_opt` - Type
    if not (isinstance(bar_opt, str)):
        msg = '`bar_opt` ({0}) must be of `string` type!'.format(
            type(bar_opt))
        raise TypeError(msg)
    # Progress bar - Value
    if not (bar_opt in ['tqdm', 'native']):
        msg = '{0} `bar_opt` ({1}) is not a valid option! Exiting'
        msg = msg.format(file_msg, bar_opt)
        raise LSSUtils_Error(msg)
    ##
    ## List of files in the URL
    files_arr = url_file_list(url, ext)
    # Creating directory
    if create_dir:
        cfutils.Path_Folder(outdir)
    # Check for its existence
    if check_exist:
        if not (os.path.exists(outdir)):
            msg = '`outdir` ({0}) was not found!'.format(outdir)
            raise FileNotFoundError(msg)
    ##
    ## Downloading files to output directory
    if len(files_arr) > 0:
        if (bar_opt == 'tqdm'):
            tqdm_desc = 'Downloading files: '
            for file_ii in tqdm(files_arr, desc=tqdm_desc):
                # Local file
                file_ii_local = os.path.join(outdir,
                                             os.path.basename(file_ii))
                # Checking if local file exists
                if os.path.exists(file_ii_local):
                    if remove_files:
                        os.remove(file_ii_local)
                        wget_opt = True
                    else:
                        wget_opt = False
                else:
                    wget_opt = True
                ##
                ## Only downloading if necessary
                if wget_opt:
                    wget.download(file_ii, out=outdir, bar=None)
        elif (bar_opt == 'native'):
            for file_ii in files_arr:
                # Local file
                file_ii_local = os.path.join(outdir,
                                             os.path.basename(file_ii))
                # Checking if local file exists
                if os.path.exists(file_ii_local):
                    if remove_files:
                        os.remove(file_ii_local)
                        wget_opt = True
                    else:
                        wget_opt = False
                else:
                    wget_opt = True
                ##
                ## Only downloading if necessary
                if wget_opt:
                    wget.download(file_ii, out=outdir)
    else:
        msg = '{0} Number of files is ZERO!'.format(file_msg)
        print(msg)

def abundance_matching_f(dict1, dict2, volume1=1., volume2=1., reverse=True,
                         dens1_opt=False):
    """
    Abundance matching based on 2 quantities. It assigns values from
    `dict2` to elements in `dict1`.

    Parameters
    -----------
    dict1 : python dictionary or `numpy.ndarray`
        Dictionary or array of the 1st property.

        Keys :
            - `var` : 1st variable to be analysed
            - `dens` : Density array corresponding to `var` elements.
              Only given if `dens1_opt` == True.

    dict2 : python dictionary
        Dictionary of the 2nd property.

        Keys :
            - `var` : 2nd variable to be analyzed
            - `dens` : Density array corresponding to `var` elements.

    volume1 : float
        Corresponding volume to `dict1`.

    volume2 : float
        Corresponding volume to `dict2`.

    reverse : `bool`, optional
        Determines the relation between `var1` and `var2`.

    dens1_opt : `bool`, optional
        Determines whether the density of the 1st property is provided
        or must be calculated.

        Options :
            - `True` : Density is already provided as a key of `dict1`.
            - `False` : Density must be calculated.

    Returns
    -----------
    var1_ab : `numpy.ndarray`
        Array of elements matching those of `dict1`, after matching with
        `dict2`.
    """
    file_msg = fd.Program_Msg(__file__)
    # Check types of input parameters
    valid_types = (list, dict, np.ndarray)
    # `dict1`
    if not (isinstance(dict1, valid_types)):
        msg = '{0} `dict1` ({1}) is not a valid type!'.format(
            file_msg, type(dict1))
        raise LSSUtils_Error(msg)
    # `dict2`
    if not (isinstance(dict2, dict)):
        msg = '{0} `dict2` must be a dictionary. Its type is `{1}`'.format(
            file_msg, type(dict2))
        raise LSSUtils_Error(msg)
    # 2nd property
    var2 = np.asarray(dict2['var'])
    dens2 = np.asarray(dict2['dens'])
    #
    # `dens1_opt`
    if dens1_opt:
        # 1st Property - variable and density are both provided
        var1 = np.asarray(dict1['var'])
        dens_1 = np.asarray(dict1['dens'])
    else:
        if (isinstance(dict1, dict)):
            var1 = dict1['var']
        elif (isinstance(dict1, (list, np.ndarray))):
            var1 = dict1.copy()
        #
        # Determining relation between `var1` and `var2`
        mono_opt_1 = reversed_arrays(var1, var2)
        # Monotonically increasing or decreasing
        if mono_opt_1:
            counts_1 = np.array([np.where(var1 > x)[0].size
                                 for x in var1]) + 1
        else:
            counts_1 = np.array([np.where(var1 < x)[0].size
                                 for x in var1]) + 1
        #
        # Determining density of 1st property
        dens_1 = counts_1.astype(float) / volume1
    #
    # Interpolation for 2nd property
    var2_interp = interp1d(dens2, var2, bounds_error=True,
                           assume_sorted=False)
    # Assigning values to property 1
    var1_ab = np.asarray([var2_interp(xx) for xx in dens_1])

    return var1_ab

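# A toy sketch of `abundance_matching_f`: `dict2` provides a cumulative
# density vs. value relation, and each element of `var1` is assigned the
# `var2` value whose density matches its own rank-derived density. All
# numbers below are made up:
#
#   >>> import numpy as np
#   >>> dict2 = {'var': np.linspace(10., 15., 50),
#   ...          'dens': np.logspace(-1, -5, 50)}
#   >>> var1 = np.random.uniform(8., 12., 100)
#   >>> var1_ab = abundance_matching_f(var1, dict2, volume1=1.0e4)
#   >>> var1_ab.shape
#   (100,)
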
def Behroozi_relation(log_mstar, z=0.):
    """
    Returns the halo mass of a central galaxy as a function of its
    stellar mass.

    Parameters
    -----------
    log_mstar : `float`, `np.ndarray`, or array-like
        Value or array of values of base-10 logarithm of stellar mass
        in h=1 solar mass units.

    z : int, float, `np.ndarray` or array-like
        Redshift of the halo hosting the galaxy. If passing an array,
        it must be of the same length as the input `log_mstar`.

    Returns
    -----------
    log_halo_mass : float or `np.ndarray`
        Array or float containing 10-base logarithm of halo mass in
        ``h=1`` solar mass units.

    Notes
    ----------
    The parameter values in Behroozi+10 were fit to data assuming
    ``h=0.7``. Thus, we will transform our input stellar mass to
    ``h=0.7`` units, evaluate using the Behroozi parameters, and then
    transform back to ``h=1`` units before returning the result.
    """
    file_msg = fd.Program_Msg(__file__)
    little_h = 0.7
    ## Checking input parameters
    # `log_mstar`
    mstar_valid_types = (int, float, np.ndarray, list)
    if not (isinstance(log_mstar, mstar_valid_types)):
        msg = '{0} `log_mstar` ({1}) is not a valid type!'.format(
            file_msg, type(log_mstar))
        raise TypeError(msg)
    # Lists must be converted to arrays before exponentiation
    if isinstance(log_mstar, list):
        log_mstar = np.asarray(log_mstar)
    ##
    ## Behroozi dictionary
    param_dict = _retrieve_Behroozi_default_dict()
    ## Converting stellar mass from ``h=1`` units to ``h=0.7`` units.
    mstar = (10.**log_mstar) / (little_h**2)
    ## Scale factor
    a = 1. / (1. + z)
    ##
    ## Behroozi function
    logm0 = param_dict['smhm_m0_0'] + param_dict['smhm_m0_a'] * (a - 1.)
    m0 = 10.**logm0
    logm1 = param_dict['smhm_m1_0'] + param_dict['smhm_m1_a'] * (a - 1.)
    beta = param_dict['smhm_beta_0'] + param_dict['smhm_beta_a'] * (a - 1.)
    delta = param_dict['smhm_delta_0'] + param_dict['smhm_delta_a'] * (a - 1.)
    gamma = param_dict['smhm_gamma_0'] + param_dict['smhm_gamma_a'] * (a - 1.)
    #
    stellar_mass_by_m0 = mstar / m0
    term3_numerator = (stellar_mass_by_m0)**delta
    term3_denominator = 1. + (stellar_mass_by_m0)**(-gamma)
    #
    log_halo_mass = logm1 + beta * np.log10(stellar_mass_by_m0)
    log_halo_mass += (term3_numerator / term3_denominator) - 0.5
    #
    # Convert back from ``h=0.7`` to ``h=1`` units
    return np.log10((10.**log_halo_mass) * (little_h))

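# A quick sketch of the Behroozi+10 relation at z=0; a log stellar mass
# of ~10.5 should map to a halo mass of very roughly 10^12 solar masses
# (the exact value depends on the parameter dictionary):
#
#   >>> log_halo_mass = Behroozi_relation(10.5, z=0.)
#   >>> 11.0 < log_halo_mass < 13.0  # doctest: +SKIP
#   True
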
def data_preprocessing(feat_arr, pre_opt='min_max', reshape=False):
    """
    Preprocesses the data used, in order to clean and make the data more
    suitable for the machine learning algorithms

    Parameters
    -----------
    feat_arr : `numpy.ndarray`, `list`, `pandas.DataFrame`
        Array of feature values. This array is used for training a
        ML algorithm.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`.

        Options:
            - 'min_max' : Turns `feat_arr` to values between (0,1)
            - 'standard' : Uses `~sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `~sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    reshape : `bool`, optional
        If True, it reshapes `feat_arr` into a 1d array if its shape is
        equal to (ncols, 1), where `ncols` is the number of columns.
        This variable is set to `False` by default.

    Returns
    -----------
    feat_arr_scaled : `numpy.ndarray`
        Rescaled version of `feat_arr` based on the choice of `pre_opt`.

    Notes
    -----------
    For more information on how to pre-process your data, see
    `http://scikit-learn.org/stable/modules/preprocessing.html`_.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `feat_arr`
    feat_arr_type_valid = (list, np.ndarray, pd.DataFrame)
    if not (isinstance(feat_arr, feat_arr_type_valid)):
        msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
            file_msg, type(feat_arr))
        raise LSSUtils_Error(msg)
    # `pre_opt`
    pre_opt_valid = ['min_max', 'standard', 'normalize', 'no']
    if not (pre_opt in pre_opt_valid):
        msg = '{0} `pre_opt` ({1}) is not a valid input'.format(
            file_msg, pre_opt)
        raise LSSUtils_Error(msg)
    ##
    ## Reshaping `feat_arr`
    if reshape:
        feat_arr = gu.reshape_arr_1d(feat_arr)
    ##
    ## Scaling `feat_arr`
    # Min-Max scaling
    if (pre_opt == 'min_max'):
        # Scaler
        scaler = skpre.MinMaxScaler(feature_range=(0, 1))
        # Rescaling
        feat_arr_scaled = scaler.fit_transform(feat_arr)
    ## Standardize Data
    if (pre_opt == 'standard'):
        # Scaler
        scaler = skpre.StandardScaler().fit(feat_arr)
        # Rescaling
        feat_arr_scaled = scaler.transform(feat_arr)
    ## Normalize Data
    if (pre_opt == 'normalize'):
        # Scaler
        scaler = skpre.Normalizer().fit(feat_arr)
        # Rescaling
        feat_arr_scaled = scaler.transform(feat_arr)
    ## No Preprocessing
    if (pre_opt == 'no'):
        feat_arr_scaled = feat_arr

    return feat_arr_scaled

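# A sketch of `data_preprocessing` with the 'min_max' option, which rescales
# each feature column to the [0, 1] range (output spacing may vary across
# NumPy versions):
#
#   >>> import numpy as np
#   >>> feat_arr = np.array([[1., 10.], [2., 20.], [3., 30.]])
#   >>> data_preprocessing(feat_arr, pre_opt='min_max')
#   array([[ 0. ,  0. ],
#          [ 0.5,  0.5],
#          [ 1. ,  1. ]])
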
def extract_catls(catl_kind='data', catl_type='mr', sample_s='19',
                  datatype='.hdf5', catl_info='members', halotype='fof',
                  clf_method=3, hod_n=0, clf_seed=1235, dv=1.0,
                  sigma_clf_c=0.1417, perf_opt=False, return_len=False,
                  print_filedir=True):
    """
    Extracts a list of synthetic catalogues given input parameters.

    Parameters
    ------------
    catl_kind : {'data', 'mocks'} str, optional
        Kind of catalogue to use. This variable is set to `data` by
        default.

        Options:
            - `data` : catalogues come from the SDSS `real` catalogue
            - `mocks` : catalogues come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} str, optional
        Type of catalogue to use. It shows which abundance matching
        method was used for the CLF when assigning halo masses. This
        variable is set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample to use. This variable is set to '19' by
        default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    datatype : {'.hdf5'} str, optional
        Data type of the files to be indexed in the folder. This
        variable is set to '.hdf5' by default.

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    halotype : {'fof', 'so'} str, optional
        Type of dark matter halo of the simulation used to create the
        synthetic catalogues. This variable is set to `fof` by default.

        Options:
            - 'fof' : Friends-of-Friends halos.
            - 'so'  : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies. This
        variable is set to `3` by default.

        Options:
            - `1` : Independent assignment of (g-r) color, sersic, and
              log(ssfr)
            - `2` : (g-r) decides active/passive designation and draws
              the other values independently
            - `3` : (g-r) decides active/passive designation, and
              assigns the other galaxy properties for that given galaxy

    hod_n : int, optional
        HOD model to use, in the range (0, 19). Only relevant when
        ``catl_kind == 'mocks'``. This variable is set to `0` by default.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. This variable is set to
        `1235` by default.

    dv : float, optional
        Difference between galaxy and mass velocity profiles
        (v_g - v_c) / (v_m - v_c). This value is set to `1.0` by default.

    sigma_clf_c : `float`, optional
        Value of the scatter in log(L) for central galaxies in the CLF.
        This variable is set to ``0.1417`` by default.

    perf_opt : `bool`, optional
        If True, it chooses to analyze the `perfect` set of synthetic
        catalogues. This variable is set to `False` by default.

    return_len : `bool`, optional
        If True, the function returns the total number of elements in
        the folder that match the criteria.

    print_filedir : `bool`, optional
        If True, the output directory is printed onto the screen.

    Returns
    ------------
    catl_arr : `numpy.ndarray`
        Array of elements/files matching the `datatype` type in the
        directory.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception raised if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    catl_kind_valid  = ['data', 'mocks']
    catl_type_valid  = ['mr', 'mstar']
    sample_s_valid   = ['19', '20', '21']
    catl_info_valid  = ['members', 'groups']
    halotype_valid   = ['fof', 'so']
    clf_method_valid = [1, 2, 3]
    hod_n_valid      = np.arange(0, 20)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_type`
    if not (catl_type in catl_type_valid):
        msg = '{0} `catl_type` ({1}) is not a valid input!'.format(
            file_msg, catl_type)
        raise LSSUtils_Error(msg)
    # `sample_s`
    if not (sample_s in sample_s_valid):
        msg = '{0} `sample_s` ({1}) is not a valid input!'.format(
            file_msg, sample_s)
        raise LSSUtils_Error(msg)
    # `catl_info`
    if not (catl_info in catl_info_valid):
        msg = '{0} `catl_info` ({1}) is not a valid input!'.format(
            file_msg, catl_info)
        raise LSSUtils_Error(msg)
    # `halotype`
    if not (halotype in halotype_valid):
        msg = '{0} `halotype` ({1}) is not a valid input!'.format(
            file_msg, halotype)
        raise LSSUtils_Error(msg)
    # `clf_method`
    if not (clf_method in clf_method_valid):
        msg = '{0} `clf_method` ({1}) is not a valid input!'.format(
            file_msg, clf_method)
        raise LSSUtils_Error(msg)
    # `hod_n`
    if not (hod_n in hod_n_valid):
        msg = '{0} `hod_n` ({1}) is not a valid input!'.format(file_msg, hod_n)
        raise LSSUtils_Error(msg)
    # `perf_opt`
    if not (isinstance(perf_opt, bool)):
        msg = '{0} `perf_opt` ({1}) is not a valid type!'.format(
            file_msg, type(perf_opt))
        raise LSSUtils_Error(msg)
    # `print_filedir`
    if not (isinstance(print_filedir, bool)):
        msg = '{0} `print_filedir` ({1}) is not a valid type!'.format(
            file_msg, type(print_filedir))
        raise LSSUtils_Error(msg)
    # `dv`
    if not (dv > 0):
        msg = '{0} `dv` ({1}) must be larger than 0!'.format(file_msg, dv)
        raise LSSUtils_Error(msg)
    # `sigma_clf_c` - Type
    if not (isinstance(sigma_clf_c, float)):
        msg = '{0} `sigma_clf_c` ({1}) is not a valid input type!'
        msg = msg.format(file_msg, type(sigma_clf_c))
        raise LSSUtils_Error(msg)
    # `sigma_clf_c` - Value
    if not (sigma_clf_c >= 0.):
        msg = '{0} `sigma_clf_c` ({1}) must be greater than or equal to 0!'
        msg = msg.format(file_msg, sigma_clf_c)
        raise LSSUtils_Error(msg)
    # `return_len`
    if not (isinstance(return_len, bool)):
        msg = '{0} `return_len` ({1}) is not a valid type!'.format(
            file_msg, type(return_len))
        raise LSSUtils_Error(msg)
    # `datatype`
    if not (isinstance(datatype, str)):
        msg = '{0} `datatype` ({1}) is not a valid type!'.format(
            file_msg, type(datatype))
        raise LSSUtils_Error(msg)
    #
    # Extracting the path of the catalogues
    filedir = catl_sdss_dir(catl_kind=catl_kind,
                            catl_type=catl_type,
                            sample_s=sample_s,
                            catl_info=catl_info,
                            halotype=halotype,
                            clf_method=clf_method,
                            hod_n=hod_n,
                            clf_seed=clf_seed,
                            dv=dv,
                            sigma_clf_c=sigma_clf_c,
                            perf_opt=perf_opt,
                            print_filedir=print_filedir)
    #
    # Converting to array
    catl_arr = np.sort(fd.Index(filedir, datatype))
    # Checking number of elements
    if len(catl_arr) == 0:
        msg = '{0} `catl_arr` contains 0 entries!'.format(file_msg)
        raise LSSUtils_Error(msg)
    #
    # Returning elements
    if return_len:
        return catl_arr, len(catl_arr)
    else:
        return catl_arr
def sdss_catl_clean_nmin(catl_pd, catl_kind, catl_info='members', nmin=1,
                         perf_opt=False):
    """
    Cleans the catalogue by removing `failed` values, and only includes
    galaxies that are in groups/halos above a `nmin` richness threshold.

    Parameters
    -----------
    catl_pd : `pandas.DataFrame`
        Dataset with the catalogue information.

    catl_kind : {'data', 'mocks'} str
        Kind of catalogue to use.

        Options:
            - `data` : catalogues come from the SDSS `real` catalogue
            - `mocks` : catalogues come from SDSS `mock` catalogues

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    nmin : int, optional
        Minimum group richness to have in the (galaxy) group catalogue.
        This variable is set to `1` by default.

    perf_opt : `bool`, optional
        Option for using a `perfect` mock catalogue.

    Returns
    -----------
    catl_pd_clean : `pandas.DataFrame`
        Cleaned version of `catl_pd`, after having removed `failed`
        values and chosen only galaxies in groups above the group
        richness threshold of `nmin`.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception raised if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    catl_kind_valid = ['data', 'mocks']
    catl_info_valid = ['members', 'groups']
    # `catl_pd`
    if not (isinstance(catl_pd, pd.DataFrame)):
        msg = '{0} `catl_pd` ({1}) is not a valid type!'.format(
            file_msg, type(catl_pd))
        raise LSSUtils_Error(msg)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_info`
    if not (catl_info in catl_info_valid):
        msg = '{0} `catl_info` ({1}) is not a valid input!'.format(
            file_msg, catl_info)
        raise LSSUtils_Error(msg)
    # `nmin`
    if not ((nmin > 0) and (isinstance(nmin, int))):
        msg = '{0} `nmin` must be an integer and have a value above `0`'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    # `perf_opt`
    if not (isinstance(perf_opt, bool)):
        msg = '{0} `perf_opt` ({1}) is not a valid type!'.format(
            file_msg, type(perf_opt))
        raise LSSUtils_Error(msg)
    #
    # Types of galaxies
    cens = int(1)
    nmin = int(nmin)
    #
    # Getting keys for catalogue
    (gm_key, id_key, galtype_key) = catl_keys(catl_kind,
                                              return_type='list',
                                              perf_opt=perf_opt)
    # Cleaning catalogue entries
    catl_pd_clean_all = sdss_catl_clean(catl_pd,
                                        catl_kind=catl_kind,
                                        catl_info=catl_info,
                                        reindex=True)
    # Choosing only galaxies in groups of richness >= `nmin`
    # Member galaxies
    if catl_info == 'members':
        # Centrals
        catl_pd_cens = catl_pd_clean_all.loc[
            (catl_pd_clean_all[galtype_key] == cens), id_key]
        catl_pd_cl = catl_pd_clean_all[
            catl_pd_clean_all[id_key].isin(catl_pd_cens)]
        # Group counts
        group_counts = Counter(catl_pd_cl[id_key])
        group_ngals  = np.array(
            [xx for xx in group_counts.keys() if group_counts[xx] >= nmin])
        # Cleaned version
        catl_pd_clean = catl_pd_cl[catl_pd_cl[id_key].isin(group_ngals)]
        catl_pd_clean.reset_index(inplace=True, drop=True)
    # Group catalogue
    if catl_info == 'groups':
        if ('ngals' in catl_pd_clean_all.columns.tolist()):
            catl_pd_clean = catl_pd_clean_all.loc[
                catl_pd_clean_all['ngals'] >= nmin]
            catl_pd_clean.reset_index(inplace=True, drop=True)
        else:
            msg = '{0} Key `ngals` not found in DataFrame ... Exiting!'
            msg = msg.format(file_msg)
            raise LSSUtils_Error(msg)

    return catl_pd_clean
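# ---------------------------------------------------------------------------
# Illustrative sketch (added example, not part of the original API): a minimal
# end-to-end pass that reads the first mock member catalogue on disk and keeps
# only galaxies in groups with 2 or more members. Assumes the catalogues are
# available locally and that `read_hdf5_file_to_pandas_DF` is defined in this
# module; all keyword values are for demonstration only.
def _example_clean_catalogue():
    # First mock member catalogue found on disk
    catl_arr = extract_catls(catl_kind='mocks', catl_info='members',
                             print_filedir=False)
    catl_pd = read_hdf5_file_to_pandas_DF(catl_arr[0])
    # Removing `failed` values and groups below the richness threshold
    catl_pd_clean = sdss_catl_clean_nmin(catl_pd, 'mocks',
                                         catl_info='members', nmin=2)
    return catl_pd_clean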