def group_by_to_index(index: tp.Index, group_by: tp.GroupByLike) -> GroupByT:
    """Convert mapper `group_by` to `pd.Index`.

    Args:
        index (pd.Index): Index whose elements are being grouped.
        group_by (any): Grouping instruction. The following forms are handled:

            * None or False: returned unchanged, no conversion takes place.
            * True: every element of `index` is assigned to a single group (label 0).
            * Integer or string: resolved against `index` using `index_fns.select_levels`.
            * Sequence shorter/longer than `index` whose first element is an integer or string,
                while `index` is a `pd.MultiIndex` with at least as many levels:
                resolved using `index_fns.select_levels`; if that fails with `IndexError`
                or `KeyError`, the sequence is used as a mapper instead.
            * Anything else: used as a mapper and wrapped with `pd.Index` if it isn't one already.

    Returns:
        `group_by` itself if it is None or False, otherwise a `pd.Index` of the same length as `index`.

    Raises:
        ValueError: If the resulting mapper and `index` differ in length.

    !!! note
        Index and mapper must have the same length."""
    if group_by is None or group_by is False:
        return group_by
    if group_by is True:
        group_by = pd.Index(np.full(len(index), 0))  # one group
    elif isinstance(group_by, (int, str)):
        group_by = index_fns.select_levels(index, group_by)
    elif checks.is_sequence(group_by):
        # A sequence that cannot be a per-element mapper (wrong length) may instead be
        # a list of level positions/names. The emptiness guard must precede `group_by[0]`:
        # without it an empty sequence raises a bare IndexError here instead of
        # reaching the length validation below.
        looks_like_levels = (
            len(group_by) != len(index)
            and len(group_by) > 0
            and isinstance(group_by[0], (int, str))
            and isinstance(index, pd.MultiIndex)
            and len(group_by) <= len(index.names)
        )
        if looks_like_levels:
            try:
                group_by = index_fns.select_levels(index, group_by)
            except (IndexError, KeyError):
                # Deliberate fallback: not valid levels, so treat the sequence as a mapper
                # and let the length check below decide whether it is acceptable.
                pass
    if not isinstance(group_by, pd.Index):
        group_by = pd.Index(group_by)
    if len(group_by) != len(index):
        raise ValueError("group_by and index must have the same length")
    return group_by
def download(cls: tp.Type[DataT],
             symbols: tp.Union[tp.Label, tp.Labels],
             tz_localize: tp.Optional[tp.TimezoneLike] = None,
             tz_convert: tp.Optional[tp.TimezoneLike] = None,
             missing_index: tp.Optional[str] = None,
             missing_columns: tp.Optional[str] = None,
             wrapper_kwargs: tp.KwargsLike = None,
             **kwargs) -> DataT:
    """Fetch every requested symbol with `Data.download_symbol` and build an instance from the results.

    Args:
        symbols (hashable or sequence of hashable): A single symbol or a collection of symbols.

            !!! note
                A tuple is hashable and is therefore treated as one symbol, not as a collection.
        tz_localize (any): See `Data.from_data`.
        tz_convert (any): See `Data.from_data`.
        missing_index (str): See `Data.from_data`.
        missing_columns (str): See `Data.from_data`.
        wrapper_kwargs (dict): See `Data.from_data`.
        **kwargs: Forwarded to `Data.download_symbol` after being filtered per symbol
            by `select_symbol_kwargs`.

            To give two symbols different values for the same argument,
            pass `symbol_dict` for that argument.

    Returns:
        New instance created by `from_data`. The unfiltered `kwargs` are handed over
        as `download_kwargs`.

    Raises:
        TypeError: If `symbols` is neither hashable nor a sequence.
    """
    if not checks.is_hashable(symbols):
        if not checks.is_sequence(symbols):
            raise TypeError("Symbols must be either hashable or sequence of hashable")
        symbol_list = symbols
    else:
        # A lone symbol is processed as a one-element collection
        symbol_list = [symbols]

    # For each symbol: pick its own keyword arguments, then download with them
    downloaded = {
        symbol: cls.download_symbol(symbol, **cls.select_symbol_kwargs(symbol, kwargs))
        for symbol in symbol_list
    }

    # Wrap the downloaded objects into a new instance
    from_data_kwargs = dict(
        tz_localize=tz_localize,
        tz_convert=tz_convert,
        missing_index=missing_index,
        missing_columns=missing_columns,
        wrapper_kwargs=wrapper_kwargs,
        download_kwargs=kwargs
    )
    return cls.from_data(downloaded, **from_data_kwargs)
def split_ranges_into_sets(
        start_idxs: tp.ArrayLike,
        end_idxs: tp.ArrayLike,
        set_lens: tp.MaybeSequence[tp.Sequence[float]] = (),
        left_to_right: tp.MaybeSequence[bool] = True) -> RangesT:
    """Generate a range for each pair in `start_idxs` and `end_idxs` and optionally split it into sets.

    Each range covers `start_idx` through `end_idx`, both inclusive. This is a generator:
    one tuple of index arrays is yielded per range, one array per set.

    Args:
        start_idxs (array_like): Start indices.
        end_idxs (array_like): End indices (inclusive).
        set_lens (list of float): Lengths of sets in each range.

            A length strictly between 0 and 1 is a fraction of the range length (rounded down);
            any other value is used as an absolute number of elements.

            The number of returned sets is the length of `set_lens` plus one,
            where the additional set stores the remaining elements.
            If empty, each range is yielded as a single set.

            Can be passed per range.
        left_to_right (bool or list of bool): Whether to resolve `set_lens` from left to right.

            If True, the remaining (variable) set comes last, otherwise it comes first.

            Can be passed per range.

    Raises:
        ValueError: If a fractional length resolves to zero elements, or if the lengths
            do not leave at least one element for the remaining set.

    ## Example

    * `set_lens=(0.5,)`: 50% in training set, the rest in test set
    * `set_lens=(0.5, 0.25)`: 50% in training set, 25% in validation set, the rest in test set
    * `set_lens=(50, 30)`: 50 in training set, 30 in validation set, the rest in test set
    * `set_lens=(50, 30)` and `left_to_right=False`:
        30 in test set, 50 in validation set, the rest in training set
    """
    start_idxs = np.asarray(start_idxs)
    end_idxs = np.asarray(end_idxs)
    checks.assert_len_equal(start_idxs, end_idxs)

    for i, start_idx in enumerate(start_idxs):
        end_idx = end_idxs[i]
        range_len = end_idx - start_idx + 1

        # No split requested: the whole range forms the only set
        if len(set_lens) == 0:
            yield (np.arange(start_idx, end_idx + 1), )
            continue

        # Pick the settings of this range if they were provided per range
        _set_lens = set_lens[i] if checks.is_sequence(set_lens[0]) else set_lens
        fixed_first = left_to_right[i] if checks.is_sequence(left_to_right) else left_to_right

        # Turn fractional lengths into absolute numbers of elements
        fixed_lens = []
        for j, set_len in enumerate(_set_lens):
            if 0 < set_len < 1:
                set_len = math.floor(set_len * range_len)
                if set_len == 0:
                    raise ValueError(f"Set {j} in the range {i} is empty")
            fixed_lens.append(set_len)

        # The variable set takes whatever is left; there must be something left
        fixed_total = sum(fixed_lens)
        if not fixed_total < range_len:
            raise ValueError(
                f"Range of length {range_len} too short to split into {len(_set_lens) + 1} sets"
            )
        variable_len = [range_len - fixed_total]
        all_lens = fixed_lens + variable_len if fixed_first else variable_len + fixed_lens

        # Cumulative boundaries relative to the start of the range
        bounds = [0]
        for length in all_lens:
            bounds.append(bounds[-1] + length)

        yield tuple(
            np.arange(start_idx + lower, start_idx + upper)
            for lower, upper in zip(bounds[:-1], bounds[1:])
        )