def _can_load_distributed(source: Data) -> bool:
    """Return True if it might be possible to use distributed data loading.

    The decision is made purely from the type (and, for strings and
    sequences, a shallow look at the value) of ``source``:

    * ``int``/``float``/``bool`` scalars: never distributable.
    * Objects recognized by the ``MLDataset`` or ``Modin`` data sources:
      distributable.
    * A single string: distributable only if it ends with ``".parquet"``.
    * A sequence: distributable only if it is non-empty and its first
      element is a string.
    * Any other iterable: distributable unless it is a pandas
      ``DataFrame``/``Series`` or a numpy ``ndarray``.
    * Everything else: distributable by default.

    Args:
        source: The data source to inspect.

    Returns:
        Whether distributed loading may be possible for ``source``.
    """
    # Imported lazily inside the function, as in the original code.
    from xgboost_ray.data_sources.ml_dataset import MLDataset
    from xgboost_ray.data_sources.modin import Modin

    if isinstance(source, (int, float, bool)):
        return False
    elif MLDataset.is_data_type(source):
        return True
    elif Modin.is_data_type(source):
        return True
    elif isinstance(source, str):
        # Strings should point to files or URLs
        # Usually parquet files point to directories
        return source.endswith(".parquet")
    elif isinstance(source, Sequence):
        # Sequence of strings should point to files or URLs.
        # An empty sequence has no first element to inspect; report it as
        # not distributable instead of raising IndexError on `source[0]`.
        return len(source) > 0 and isinstance(source[0], str)
    elif isinstance(source, Iterable):
        # If we get an iterable but not a sequence, the best we can do
        # is check if we have a known non-distributed object
        if isinstance(source, (pd.DataFrame, pd.Series, np.ndarray)):
            return False

    # Per default, allow distributed loading.
    return True
def _detect_distributed(source: Data) -> bool:
    """Decide whether distributed data loading should be attempted.

    This is stricter than ``_can_load_distributed``: a source that *could*
    be loaded in a distributed fashion is only auto-detected as such when
    it is recognized by the ``MLDataset`` or ``Modin`` data sources, is a
    string, is a sequence whose first element is a string, or is not
    iterable at all.

    Args:
        source: The data source to inspect.

    Returns:
        True if distributed loading should be tried, False otherwise.
    """
    from xgboost_ray.data_sources.ml_dataset import MLDataset
    from xgboost_ray.data_sources.modin import Modin

    # Nothing to detect if distributed loading is ruled out entirely.
    if not _can_load_distributed(source):
        return False

    # Known distributed data sources are always detected as distributed.
    if MLDataset.is_data_type(source) or Modin.is_data_type(source):
        return True

    # A single string (file path or URL) is treated as distributed.
    if isinstance(source, str):
        return True

    # Non-iterable objects fall through to the permissive default.
    if not isinstance(source, Iterable):
        return True

    # Remaining iterables are only detected as distributed when they are a
    # sequence starting with a string. Any other iterable is detected as
    # False per default, which can be overridden by passing
    # `distributed=True` to the RayDMatrix object.
    starts_with_str = isinstance(source, Sequence) and isinstance(
        source[0], str)
    return starts_with_str