def _get_axes(N, K, index, columns):
    # helper to create the axes as indexes
    # return axes or defaults

    if index is None:
        index = ibase.default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = ibase.default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns
def _prep_index(self, data, index, columns):
    N, K = data.shape
    if index is None:
        index = ibase.default_index(N)
    if columns is None:
        columns = ibase.default_index(K)

    if len(columns) != K:
        raise ValueError('Column length mismatch: {columns} vs. {K}'
                         .format(columns=len(columns), K=K))
    if len(index) != N:
        raise ValueError('Index length mismatch: {index} vs. {N}'
                         .format(index=len(index), N=N))
    return index, columns
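# Illustrative usage (not from the original source): the length checks above
# surface through the public DataFrame constructor; exact message wording
# varies across pandas versions.
import numpy as np
import pandas as pd

pd.DataFrame(np.zeros((3, 2)), columns=["x", "y"])   # OK: K == len(columns)
# pd.DataFrame(np.zeros((3, 2)), columns=["x", "y", "z"])
#   -> ValueError: 3 column labels passed for data with 2 columns
# pd.DataFrame(np.zeros((3, 2)), index=[0, 1])
#   -> ValueError: index of length 2 for data with 3 rows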
def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
    if columns is None:
        columns = _get_objs_combined_axis(data, sort=False)

    indexer_cache = {}

    aligned_values = []
    for s in data:
        index = getattr(s, 'index', None)
        if index is None:
            index = ibase.default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = com.values_from_object(s)
        aligned_values.append(algorithms.take_1d(values, indexer))

    values = np.vstack(aligned_values)

    if values.dtype == np.object_:
        content = list(values.T)
        return _convert_object_array(content, columns, dtype=dtype,
                                     coerce_float=coerce_float)
    else:
        return values.T, columns
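# Illustrative usage (not from the original source): Index.get_indexer is the
# alignment primitive being cached above; -1 marks labels missing from the
# source index, and take_1d fills those positions with NaN.
import pandas as pd

idx = pd.Index(["a", "b", "c"])
idx.get_indexer(pd.Index(["b", "x", "a"]))  # array([ 1, -1,  0])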
def _get_concat_axis(self):
    """
    Return index to be used along concatenation axis.
    """
    if self._is_series:
        if self.axis == 0:
            indexes = [x.index for x in self.objs]
        elif self.ignore_index:
            idx = ibase.default_index(len(self.objs))
            return idx
        elif self.keys is None:
            names = [None] * len(self.objs)
            num = 0
            has_names = False
            for i, x in enumerate(self.objs):
                if not isinstance(x, Series):
                    raise TypeError("Cannot concatenate type 'Series' "
                                    "with object of type {type!r}"
                                    .format(type=type(x).__name__))
                if x.name is not None:
                    names[i] = x.name
                    has_names = True
                else:
                    names[i] = num
                    num += 1
            if has_names:
                return Index(names)
            else:
                return ibase.default_index(len(self.objs))
        else:
            return ensure_index(self.keys).set_names(self.names)
    else:
        indexes = [x._data.axes[self.axis] for x in self.objs]

    if self.ignore_index:
        idx = ibase.default_index(sum(len(i) for i in indexes))
        return idx

    if self.keys is None:
        concat_axis = _concat_indexes(indexes)
    else:
        concat_axis = _make_concat_multiindex(indexes, self.keys,
                                              self.levels, self.names)

    self._maybe_check_integrity(concat_axis)

    return concat_axis
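# Illustrative usage (not from the original source): how the branches above
# surface through the public pd.concat API.
import pandas as pd

s1 = pd.Series([1, 2], name="a")
s2 = pd.Series([3, 4], name="b")
s3 = pd.Series([5, 6])  # unnamed

# All inputs named -> an Index of the names is used along axis=1.
pd.concat([s1, s2], axis=1).columns                   # Index(['a', 'b'], ...)

# Mixed named/unnamed -> unnamed Series get positional integer labels.
pd.concat([s1, s3], axis=1).columns                   # Index(['a', 0], ...)

# keys override the names and become the new axis labels.
pd.concat([s1, s2], axis=1, keys=["x", "y"]).columns  # Index(['x', 'y'], ...)

# ignore_index discards labels in favor of a default RangeIndex.
pd.concat([s1, s2], ignore_index=True).index          # RangeIndex(0, 4)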
def extract_index(data):
    index = None
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, 'ndim', 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))

        if not indexes and not raw_lengths:
            raise ValueError('If using all scalar values, you must pass'
                             ' an index')

        if have_series or have_dicts:
            index = _union_indexes(indexes)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError('arrays must all be same length')

            if have_dicts:
                raise ValueError('Mixing dicts with non-Series may lead to '
                                 'ambiguous ordering.')

            if have_series:
                if lengths[0] != len(index):
                    msg = ('array length {length} does not match index '
                           'length {idx_len}'
                           .format(length=lengths[0], idx_len=len(index)))
                    raise ValueError(msg)
            else:
                index = ibase.default_index(lengths[0])

    return ensure_index(index)
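# Illustrative usage (not from the original source): the inference rules in
# extract_index as seen from the DataFrame constructor.
import pandas as pd

# Equal-length raw arrays -> a default RangeIndex.
pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).index  # RangeIndex(0, 3)

# Series inputs contribute their indexes, which are unioned.
s = pd.Series([1.0, 2.0], index=["x", "y"])
pd.DataFrame({"a": s, "b": [3.0, 4.0]}).index         # Index(['x', 'y'], ...)

# Unequal raw lengths raise 'arrays must all be same length':
# pd.DataFrame({"a": [1, 2, 3], "b": [4, 5]})
# All scalar values without an index raise as above:
# pd.DataFrame({"a": 1, "b": 2})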
def _get_names_from_index(data) -> Index:
    has_some_name = any(getattr(s, "name", None) is not None for s in data)
    if not has_some_name:
        return ibase.default_index(len(data))

    index: list[Hashable] = list(range(len(data)))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, "name", None)
        if n is not None:
            index[i] = n
        else:
            index[i] = f"Unnamed {count}"
            count += 1

    return Index(index)
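# Illustrative usage (not from the original source): the naming scheme above
# as seen when a list of Series becomes DataFrame rows. If no Series has a
# name, a default RangeIndex is returned instead.
import pandas as pd

rows = [
    pd.Series([1, 2], name="first"),
    pd.Series([3, 4]),              # unnamed -> 'Unnamed 0'
    pd.Series([5, 6]),              # unnamed -> 'Unnamed 1'
]
pd.DataFrame(rows).index  # Index(['first', 'Unnamed 0', 'Unnamed 1'], ...)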
def get_names_from_index(data):
    has_some_name = any(getattr(s, "name", None) is not None for s in data)
    if not has_some_name:
        return ibase.default_index(len(data))

    index = list(range(len(data)))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, "name", None)
        if n is not None:
            index[i] = n
        else:
            index[i] = "Unnamed {count}".format(count=count)
            count += 1

    return index
def get_names_from_index(data):
    has_some_name = any(getattr(s, 'name', None) is not None for s in data)
    if not has_some_name:
        return ibase.default_index(len(data))

    index = lrange(len(data))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, 'name', None)
        if n is not None:
            index[i] = n
        else:
            index[i] = 'Unnamed {count}'.format(count=count)
            count += 1

    return index
def to_arrays(data, columns, dtype: Optional[DtypeObj] = None):
    """
    Return list of arrays, columns.
    """
    if isinstance(data, ABCDataFrame):
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []

    elif isinstance(data[0], Categorical):
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = list(data.dtype.names)
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        content, columns = _list_to_arrays(data, columns)
    elif isinstance(data[0], abc.Mapping):
        content, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        content, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        content, columns = _list_to_arrays(data, columns)

    content, columns = _finalize_columns_and_data(content, columns, dtype)
    return content, columns
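# Illustrative usage (not from the original source): the type dispatch above
# as exercised through the public constructor.
import pandas as pd

pd.DataFrame([(1, 2), (3, 4)])              # rows of tuples -> _list_to_arrays
pd.DataFrame([{"a": 1}, {"a": 2, "b": 3}])  # rows of dicts -> _list_of_dict_to_arrays
pd.DataFrame([pd.Series({"a": 1}), pd.Series({"a": 2})])  # rows of Series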
def to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns.
    """
    if isinstance(data, ABCDataFrame):
        if columns is not None:
            arrays = [data._ixs(i, axis=1).values
                      for i, col in enumerate(data.columns) if col in columns]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values
                      for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []

    if isinstance(data[0], (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)
    elif isinstance(data[0], compat.Mapping):
        return _list_of_dict_to_arrays(data, columns,
                                       coerce_float=coerce_float,
                                       dtype=dtype)
    elif isinstance(data[0], ABCSeries):
        return _list_of_series_to_arrays(data, columns,
                                         coerce_float=coerce_float,
                                         dtype=dtype)
    elif isinstance(data[0], Categorical):
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns
    elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and
          data.dtype.names is not None):
        columns = list(data.dtype.names)
        arrays = [data[k] for k in columns]
        return arrays, columns
    else:
        # last ditch effort
        data = lmap(tuple, data)
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)
def _convert_object_array(content, columns, coerce_float=False, dtype=None):
    if columns is None:
        columns = ibase.default_index(len(content))
    else:
        if len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError('{col:d} columns passed, passed data had '
                                 '{con} columns'.format(col=len(columns),
                                                        con=len(content)))

    # provide soft conversion of object dtypes
    def convert(arr):
        if dtype != object and dtype != np.object:
            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
            arr = maybe_cast_to_datetime(arr, dtype)
        return arr

    arrays = [convert(arr) for arr in content]

    return arrays, columns
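# Illustrative usage (not from the original source): the 'soft conversion' of
# object columns, observed through the public constructor.
import pandas as pd

# Homogeneous object columns are converted per column ...
pd.DataFrame([[1, "a"], [2, "b"]]).dtypes  # 0: int64, 1: object

# ... while an explicit object dtype suppresses the conversion.
pd.DataFrame([[1, "a"], [2, "b"]], dtype=object).dtypes  # both object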
def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        # TODO: numpy docs suggest fv must be scalar, but could it be
        #  non-scalar for object dtype?
        assert lib.is_scalar(fv), fv
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr
def masked_rec_array_to_mgr(
    data: "MaskedRecords", index, columns, dtype: Optional[DtypeObj], copy: bool
):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = _get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for col in arr_columns:
        arr = data[col]
        fv = arr.fill_value

        mask = ma.getmaskarray(arr)
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr
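# Illustrative usage (not from the original source): the numpy.ma helpers the
# fill loop above relies on.
import numpy.ma as ma

arr = ma.array([1, 2, 3], mask=[False, True, False])
ma.getdata(arr)       # array([1, 2, 3])  -- underlying data, mask ignored
ma.getmaskarray(arr)  # array([False,  True, False])
arr.fill_value        # 999999, the numpy default fill value for int64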
def _list_of_series_to_arrays(
    data: List,
    columns: Union[Index, List],
    coerce_float: bool = False,
    dtype: Optional[DtypeObj] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    indexer_cache: Dict[int, Scalar] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = ibase.default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_1d(values, indexer))

    values = np.vstack(aligned_values)

    if values.dtype == np.object_:
        content = list(values.T)
        columns = _validate_or_indexify_columns(content, columns)
        content = _convert_object_array(content, dtype=dtype,
                                        coerce_float=coerce_float)
        return content, columns
    else:
        return values.T, columns
def rec_array_to_mgr(
    data: Union[MaskedRecords, np.recarray, np.ndarray],
    index,
    columns,
    dtype: Optional[DtypeObj],
    copy: bool,
    typ: str,
):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = _get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    if isinstance(data, np.ma.MaskedArray):
        new_arrays = fill_masked_arrays(data, arr_columns)
    else:
        new_arrays = arrays

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ)

    if copy:
        mgr = mgr.copy()
    return mgr
def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr
def _list_of_series_to_arrays(
    data: List,
    columns: Optional[Index],
) -> Tuple[np.ndarray, Index]:
    # returned np.ndarray has ndim == 2

    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    indexer_cache: Dict[int, np.ndarray] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = ibase.default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_nd(values, indexer))

    # error: Argument 1 to "vstack" has incompatible type "List[ExtensionArray]";
    # expected "Sequence[Union[Union[int, float, complex, str, bytes, generic],
    # Sequence[Union[int, float, complex, str, bytes, generic]],
    # Sequence[Sequence[Any]], _SupportsArray]]"
    content = np.vstack(aligned_values)  # type: ignore[arg-type]

    return content, columns
def to_arrays(
    data, columns: Optional[Index], dtype: Optional[DtypeObj] = None
) -> Tuple[List[ArrayLike], Index]:
    """
    Return list of arrays, columns.
    """
    if isinstance(data, ABCDataFrame):
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            # error: Incompatible types in assignment (expression has type
            # "Optional[Tuple[str, ...]]", variable has type "Optional[Index]")
            columns = data.dtype.names  # type: ignore[assignment]
            if columns is not None:
                # i.e. numpy structured array
                arrays = [data[name] for name in columns]
                return arrays, ensure_index(columns)
        return [], ensure_index([])

    elif isinstance(data[0], Categorical):
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        content = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        content, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        content, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        content = _list_to_arrays(data)

    # error: Incompatible types in assignment (expression has type "List[ndarray]",
    # variable has type "List[Union[Union[str, int, float, bool], Union[Any, Any,
    # Any, Any]]]")
    content, columns = _finalize_columns_and_data(  # type: ignore[assignment]
        content, columns, dtype
    )
    # error: Incompatible return value type (got "Tuple[ndarray, Index]", expected
    # "Tuple[List[ExtensionArray], Index]")
    # error: Incompatible return value type (got "Tuple[ndarray, Index]", expected
    # "Tuple[List[ndarray], Index]")
    return content, columns  # type: ignore[return-value]
def __init__(self, data=None, index=None, sparse_index=None, kind='block',
             fill_value=None, name=None, dtype=None, copy=False,
             fastpath=False):

    # we are called internally, so short-circuit
    if fastpath:

        # data is an ndarray, index is defined
        if not isinstance(data, SingleBlockManager):
            data = SingleBlockManager(data, index, fastpath=True)
        if copy:
            data = data.copy()

    else:

        if data is None:
            data = []

        if isinstance(data, Series) and name is None:
            name = data.name

        if isinstance(data, SparseArray):
            if index is not None:
                assert (len(index) == len(data))
            sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value

            data = np.asarray(data)

        elif isinstance(data, SparseSeries):
            if index is None:
                index = data.index.view()
            if fill_value is None:
                fill_value = data.fill_value

            # extract the SingleBlockManager
            data = data._data

        elif isinstance(data, (Series, dict)):
            data = Series(data, index=index)
            index = data.index.view()

            res = make_sparse(data, kind=kind, fill_value=fill_value)
            data, sparse_index, fill_value = res

        elif isinstance(data, (tuple, list, np.ndarray)):
            # array-like
            if sparse_index is None:
                res = make_sparse(data, kind=kind, fill_value=fill_value)
                data, sparse_index, fill_value = res
            else:
                assert (len(data) == sparse_index.npoints)

        elif isinstance(data, SingleBlockManager):
            if dtype is not None:
                data = data.astype(dtype)
            if index is None:
                index = data.index.view()
            elif not data.index.equals(index) or copy:  # pragma: no cover
                # GH#19275 SingleBlockManager input should only be called
                # internally
                raise AssertionError('Cannot pass both SingleBlockManager '
                                     '`data` argument and a different '
                                     '`index` argument. `copy` must '
                                     'be False.')

        else:
            length = len(index)

            if data == fill_value or (isna(data) and isna(fill_value)):
                if kind == 'block':
                    sparse_index = BlockIndex(length, [], [])
                else:
                    sparse_index = IntIndex(length, [])
                data = np.array([])

            else:
                if kind == 'block':
                    locs, lens = ([0], [length]) if length else ([], [])
                    sparse_index = BlockIndex(length, locs, lens)
                else:
                    sparse_index = IntIndex(length, index)
                v = data
                data = np.empty(length)
                data.fill(v)

        if index is None:
            index = ibase.default_index(sparse_index.length)
        index = ensure_index(index)

        # create/copy the manager
        if isinstance(data, SingleBlockManager):
            if copy:
                data = data.copy()
        else:

            # create a sparse array
            if not isinstance(data, SparseArray):
                data = SparseArray(data, sparse_index=sparse_index,
                                   fill_value=fill_value, dtype=dtype,
                                   copy=copy)

            data = SingleBlockManager(data, index)

    generic.NDFrame.__init__(self, data)

    self.index = index
    self.name = name
def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.
    """
    if isinstance(data, ABCDataFrame):
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]
                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data[0], Categorical):
        # GH#38845 deprecate special case
        warnings.warn(
            "The behavior of DataFrame([categorical, ...]) is deprecated and "
            "in a future version will be changed to match the behavior of "
            "DataFrame([any_listlike, ...]). "
            "To retain the old behavior, pass as a dictionary "
            "DataFrame({col: categorical, ..})",
            FutureWarning,
            stacklevel=4,
        )
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
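# Illustrative usage (not from the original source): the special case being
# deprecated above. Each Categorical in the list becomes a *column*; the
# future behavior treats each list element as a row, like any other listlike.
import pandas as pd

cat = pd.Categorical(["a", "b", "a"])
pd.DataFrame([cat])     # FutureWarning; one categorical column of length 3
pd.DataFrame({0: cat})  # the forward-compatible spelling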
def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data, raise ValueError on failure.
    """
    index = None
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes: list[list[Hashable] | Index] = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))
            elif isinstance(val, np.ndarray) and val.ndim > 1:
                raise ValueError("Per-column arrays must each be 1-dimensional")

        if not indexes and not raw_lengths:
            raise ValueError("If using all scalar values, you must pass an index")

        if have_series:
            index = union_indexes(indexes)
        elif have_dicts:
            index = union_indexes(indexes, sort=False)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError("All arrays must be of the same length")

            if have_dicts:
                raise ValueError(
                    "Mixing dicts with non-Series may lead to ambiguous ordering."
                )

            if have_series:
                assert index is not None  # for mypy
                if lengths[0] != len(index):
                    msg = (
                        f"array length {lengths[0]} does not match index "
                        f"length {len(index)}"
                    )
                    raise ValueError(msg)
            else:
                index = ibase.default_index(lengths[0])

    # error: Argument 1 to "ensure_index" has incompatible type "Optional[Index]";
    # expected "Union[Union[Union[ExtensionArray, ndarray], Index, Series],
    # Sequence[Any]]"
    return ensure_index(index)  # type: ignore[arg-type]
def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """
    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data[0], Categorical):
        # GH#38845 deprecate special case
        warnings.warn(
            "The behavior of DataFrame([categorical, ...]) is deprecated and "
            "in a future version will be changed to match the behavior of "
            "DataFrame([any_listlike, ...]). "
            "To retain the old behavior, pass as a dictionary "
            "DataFrame({col: categorical, ..})",
            FutureWarning,
            stacklevel=4,
        )
        if columns is None:
            columns = ibase.default_index(len(data))
        elif len(columns) > len(data):
            raise ValueError("len(columns) > len(data)")
        elif len(columns) < len(data):
            # doing this here is akin to a pre-emptive reindex
            data = data[: len(columns)]
        return data, columns

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
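# Illustrative usage (not from the original source): the structured-array
# branch, which maps field names to columns.
import numpy as np
import pandas as pd

rec = np.array([(1, 2.0), (3, 4.0)], dtype=[("a", "i8"), ("b", "f8")])
pd.DataFrame(rec).columns  # Index(['a', 'b'], dtype='object')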