def paretorank(costs, sense=None, distinct=True, use_numba=True): """Return integer array with Pareto ranks of (non-NaN) numerical data. Observations in the Pareto set are assigned rank 1. After removing the Pareto set, the Pareto set of the remaining data is assigned rank 2, and so forth. The input data in `costs` can be either a pandas DataFrame or a NumPy ndarray of shape (observations, objectives). The user is responsible for dealing with NaN values *before* sending data to this function. Only numerical data is allowed, with the exception of `diff` (different) columns. Parameters ---------- costs : np.ndarray or pd.DataFrame Array or DataFrame of shape (observations, objectives). sense : list List with strings for each column (objective). The value `min` (default) indicates minimization, `max` indicates maximization and `diff` indicates different values. Using `diff` is equivalent to a group-by operation over the columns marked with `diff`. If None, minimization is assumed. distinct : bool How to treat duplicate rows. If `True`, only the first duplicate is returned. If `False`, every identical observation is returned instead. use_numba : bool If True, numba will be used if it is installed by the user. Returns ------- ranks : np.ndarray Integer array with Pareto ranks of the observations. Examples -------- >>> from paretoset import paretoset >>> import numpy as np >>> costs = np.array([[2, 0], [1, 1], [0, 2], [3, 3]]) >>> paretorank(costs) array([1, 1, 1, 2]) >>> paretorank(costs, sense=["min", "max"]) array([3, 2, 1, 1]) The `distinct` parameter: >>> paretorank([0, 0], distinct=True) array([1, 2]) >>> paretorank([0, 0], distinct=False) array([1, 1]) """ if user_has_package("numba") and use_numba: paretorank_algorithm = pareto_rank_naive else: paretorank_algorithm = pareto_rank_naive costs, sense = validate_inputs(costs=costs, sense=sense) assert isinstance(sense, list) n_costs, n_objectives = costs.shape diff_cols = [i for i in range(n_objectives) if sense[i] == "diff"] max_cols = [i for i in range(n_objectives) if sense[i] == "max"] min_cols = [i for i in range(n_objectives) if sense[i] == "min"] # Check data types (MIN and MAX must be numerical) message = "Data must be numerical. Please convert it. Data has type: {}" if isinstance(costs, pd.DataFrame): data_types = [costs.dtypes.values[i] for i in (max_cols + min_cols)] if any(d == np.dtype("O") for d in data_types): raise TypeError(message.format(data_types)) else: if costs.dtype == np.dtype("O"): raise TypeError(message.format(costs.dtype)) # CASE 1: THE ONLY SENSE IS MINIMIZATION # --------------------------------------- if all(s == "min" for s in sense): if isinstance(costs, pd.DataFrame): costs = costs.to_numpy(copy=True) return paretorank_algorithm(costs, distinct=distinct, use_numba=use_numba) n_costs, n_objectives = costs.shape if not diff_cols: # Its an array if not isinstance(costs, np.ndarray): costs = costs.to_numpy(copy=True) for col in max_cols: costs[:, col] = -costs[:, col] return paretorank_algorithm(costs, distinct=distinct, use_numba=use_numba) if isinstance(costs, pd.DataFrame): df = costs.copy() # Copy to avoid mutating inputs df.columns = np.arange(n_objectives) else: df = pd.DataFrame(costs) assert isinstance(df, pd.DataFrame) assert np.all(df.columns == np.arange(n_objectives)) # If `object` columns are present and they can be converted, do it. for col in max_cols + min_cols: df[col] = pd.to_numeric(df[col], errors="coerce") all_ranks = np.zeros(n_costs, dtype=np.int_) # Create the groupby object # We could've implemented our own groupby, but choose to use pandas since # it's likely better than what we can come up with on our own. groupby = df.groupby(diff_cols) # Iteration through the groups for key, data in groupby: # Get the relevant data for the group and compute the efficient points relevant_data = data[max_cols + min_cols].to_numpy(copy=True) ranks = paretorank_algorithm(relevant_data.copy(), distinct=distinct, use_numba=use_numba) # The `pd.DataFrame.groupby.indices` dict holds the row indices of the group data_mask = groupby.indices[key] all_ranks[data_mask] = ranks return all_ranks
import numpy as np from paretoset.algorithms_numpy import paretoset_efficient, pareto_rank_naive from paretoset.utils import user_has_package, validate_inputs import pandas as pd if user_has_package("numba"): from paretoset.algorithms_numba import BNL def paretoset(costs, sense=None, distinct=True, use_numba=True): """Return boolean mask indicating the Pareto set of (non-NaN) numerical data. The input data in `costs` can be either a pandas DataFrame or a NumPy ndarray of shape (observations, objectives). The user is responsible for dealing with NaN values *before* sending data to this function. Only numerical data is allowed, with the exception of `diff` (different) columns. Parameters ---------- costs : np.ndarray or pd.DataFrame Array or DataFrame of shape (observations, objectives). sense : list List with strings for each column (objective). The value `min` (default) indicates minimization, `max` indicates maximization and `diff` indicates different values. Using `diff` is equivalent to a group-by operation over the columns marked with `diff`. If None, minimization is assumed. distinct : bool How to treat duplicate rows. If `True`, only the first duplicate is returned. If `False`, every identical observation is returned instead.