def arrays_to_mgr( arrays, columns: Index, index, *, dtype: DtypeObj | None = None, verify_integrity: bool = True, typ: str | None = None, consolidate: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ if verify_integrity: # figure out the index, if necessary if index is None: index = _extract_index(arrays) else: index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway arrays = _homogenize(arrays, index, dtype) # _homogenize ensures # - all(len(x) == len(index) for x in arrays) # - all(x.ndim == 1 for x in arrays) # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays) # - all(type(x) is not PandasArray for x in arrays) else: index = ensure_index(index) arrays = [extract_array(x, extract_numpy=True) for x in arrays] # Reached via DataFrame._from_arrays; we do validation here for arr in arrays: if ( not isinstance(arr, (np.ndarray, ExtensionArray)) or arr.ndim != 1 or len(arr) != len(index) ): raise ValueError( "Arrays must be 1-dimensional np.ndarray or ExtensionArray " "with length matching len(index)" ) columns = ensure_index(columns) if len(columns) != len(arrays): raise ValueError("len(arrays) must match len(columns)") # from BlockManager perspective axes = [columns, index] if typ == "block": return create_block_manager_from_column_arrays( arrays, axes, consolidate=consolidate ) elif typ == "array": return ArrayManager(arrays, [index, columns]) else: raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
def _extract_index(data) -> Index: """ Try to infer an Index from the passed data, raise ValueError on failure. """ index = None if len(data) == 0: index = Index([]) else: raw_lengths = [] indexes: list[list[Hashable] | Index] = [] have_raw_arrays = False have_series = False have_dicts = False for val in data: if isinstance(val, ABCSeries): have_series = True indexes.append(val.index) elif isinstance(val, dict): have_dicts = True indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True raw_lengths.append(len(val)) elif isinstance(val, np.ndarray) and val.ndim > 1: raise ValueError("Per-column arrays must each be 1-dimensional") if not indexes and not raw_lengths: raise ValueError("If using all scalar values, you must pass an index") elif have_series: index = union_indexes(indexes) elif have_dicts: index = union_indexes(indexes, sort=False) if have_raw_arrays: lengths = list(set(raw_lengths)) if len(lengths) > 1: raise ValueError("All arrays must be of the same length") if have_dicts: raise ValueError( "Mixing dicts with non-Series may lead to ambiguous ordering." ) if have_series: assert index is not None # for mypy if lengths[0] != len(index): msg = ( f"array length {lengths[0]} does not match index " f"length {len(index)}" ) raise ValueError(msg) else: index = default_index(lengths[0]) # error: Argument 1 to "ensure_index" has incompatible type "Optional[Index]"; # expected "Union[Union[Union[ExtensionArray, ndarray], Index, Series], # Sequence[Any]]" return ensure_index(index) # type: ignore[arg-type]
def to_arrays( data, columns: Index | None, dtype: DtypeObj | None = None ) -> tuple[list[ArrayLike], Index]: """ Return list of arrays, columns. Returns ------- list[ArrayLike] These will become columns in a DataFrame. Index This will become frame.columns. Notes ----- Ensures that len(result_arrays) == len(result_index). """ if isinstance(data, ABCDataFrame): # see test_from_records_with_index_data, test_from_records_bad_index_column if columns is not None: arrays = [ data._ixs(i, axis=1).values for i, col in enumerate(data.columns) if col in columns ] else: columns = data.columns arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] return arrays, columns if not len(data): if isinstance(data, np.ndarray): if data.dtype.names is not None: # i.e. numpy structured array columns = ensure_index(data.dtype.names) arrays = [data[name] for name in columns] if len(data) == 0: # GH#42456 the indexing above results in list of 2D ndarrays # TODO: is that an issue with numpy? for i, arr in enumerate(arrays): if arr.ndim == 2: arrays[i] = arr[:, 0] return arrays, columns return [], ensure_index([]) elif isinstance(data[0], Categorical): # GH#38845 deprecate special case warnings.warn( "The behavior of DataFrame([categorical, ...]) is deprecated and " "in a future version will be changed to match the behavior of " "DataFrame([any_listlike, ...]). " "To retain the old behavior, pass as a dictionary " "DataFrame({col: categorical, ..})", FutureWarning, stacklevel=find_stack_level(), ) if columns is None: columns = default_index(len(data)) elif len(columns) > len(data): raise ValueError("len(columns) > len(data)") elif len(columns) < len(data): # doing this here is akin to a pre-emptive reindex data = data[: len(columns)] return data, columns elif isinstance(data, np.ndarray) and data.dtype.names is not None: # e.g. recarray columns = Index(list(data.dtype.names)) arrays = [data[k] for k in columns] return arrays, columns if isinstance(data[0], (list, tuple)): arr = _list_to_arrays(data) elif isinstance(data[0], abc.Mapping): arr, columns = _list_of_dict_to_arrays(data, columns) elif isinstance(data[0], ABCSeries): arr, columns = _list_of_series_to_arrays(data, columns) else: # last ditch effort data = [tuple(x) for x in data] arr = _list_to_arrays(data) content, columns = _finalize_columns_and_data(arr, columns, dtype) return content, columns
def ndarray_to_mgr( values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str ) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = Index([values.name]) if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) # if the array preparation does a copy -> avoid this for ArrayManager, # since the copy is done on conversion to 1D arrays copy_on_sanitize = False if typ == "array" else copy vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype): # GH#19157 if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values # error: No overload variant of "__getitem__" of "ExtensionArray" # matches argument type "Tuple[slice, int]" values = [ values[:, n] # type: ignore[call-overload] for n in range(values.shape[1]) ] else: values = [values] if columns is None: columns = Index(range(len(values))) else: columns = ensure_index(columns) return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ, PeriodDtype values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) else: # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarraylike(values, copy=copy_on_sanitize) if dtype is not None and not is_dtype_equal(values.dtype, dtype): # GH#40110 see similar check inside sanitize_array rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") values = sanitize_array( values, None, dtype=dtype, copy=copy_on_sanitize, raise_cast_failure=rcf, allow_2d=True, ) # _prep_ndarraylike ensures that values.ndim == 2 at this point index, columns = _get_axes( values.shape[0], values.shape[1], index=index, columns=columns ) _check_values_indices_shape_match(values, index, columns) if typ == "array": if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( maybe_infer_to_datetimelike(values[:, i]) ) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) arrays = [values[:, i] for i in range(values.shape[1])] if copy: arrays = [arr.copy() for arr in arrays] return ArrayManager(arrays, [index, columns], verify_integrity=False) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] block_values = [ new_block_2d(dvals_list[n], placement=BlockPlacement(n)) for n in range(len(dvals_list)) ] else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp) block_values = [nb] else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp) block_values = [nb] if len(columns) == 0: block_values = [] return create_block_manager_from_blocks( block_values, [columns, index], verify_integrity=False )
def dict_to_mgr( data: dict, index, columns, *, dtype: DtypeObj | None = None, typ: str = "block", copy: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. Used in DataFrame.__init__ """ arrays: Sequence[Any] | Series if columns is not None: from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) missing = arrays.isna() if index is None: # GH10856 # raise ValueError if only scalars in dict index = _extract_index(arrays[~missing]) else: index = ensure_index(index) # no obvious "empty" int column if missing.any() and not is_integer_dtype(dtype): nan_dtype: DtypeObj if dtype is not None: # calling sanitize_array ensures we don't mix-and-match # NA dtypes midxs = missing.values.nonzero()[0] for i in midxs: arr = sanitize_array(arrays.iat[i], index, dtype=dtype) arrays.iat[i] = arr else: # GH#1783 nan_dtype = np.dtype("object") val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) nmissing = missing.sum() if copy: rhs = [val] * nmissing else: # GH#45369 rhs = [val.copy() for _ in range(nmissing)] arrays.loc[missing] = rhs arrays = list(arrays) columns = ensure_index(columns) else: keys = list(data.keys()) columns = Index(keys) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays] if copy: if typ == "block": # We only need to copy arrays that will not get consolidated, i.e. # only EA arrays arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays] else: # dtype check to exclude e.g. range objects, scalars arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays] return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
def dict_to_mgr( data: dict, index, columns, *, dtype: DtypeObj | None = None, typ: str = "block", copy: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. Used in DataFrame.__init__ """ arrays: Sequence[Any] | Series if columns is not None: from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) missing = arrays.isna() if index is None: # GH10856 # raise ValueError if only scalars in dict index = _extract_index(arrays[~missing]) else: index = ensure_index(index) # no obvious "empty" int column if missing.any() and not is_integer_dtype(dtype): nan_dtype: DtypeObj if dtype is None or (isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.flexible)): # GH#1783 nan_dtype = np.dtype("object") else: nan_dtype = dtype val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() arrays = list(arrays) columns = ensure_index(columns) else: keys = list(data.keys()) columns = Index(keys) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies arrays = [ arr if not isinstance(arr, Index) else arr._data for arr in arrays ] arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] if copy: # arrays_to_mgr (via form_blocks) won't make copies for EAs # dtype attr check to exclude EADtype-castable strs arrays = [ x if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype) else x.copy() for x in arrays ] # TODO: can we get rid of the dt64tz special case above? return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex: if (levels is None and isinstance(keys[0], tuple)) or (levels is not None and len(levels) > 1): zipped = list(zip(*keys)) if names is None: names = [None] * len(zipped) if levels is None: _, levels = factorize_from_iterables(zipped) else: levels = [ensure_index(x) for x in levels] else: zipped = [keys] if names is None: names = [None] if levels is None: levels = [ensure_index(keys)] else: levels = [ensure_index(x) for x in levels] if not all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): # Find matching codes, include matching nan values as equal. mask = (isna(level) & isna(key)) | (level == key) if not mask.any(): raise ValueError(f"Key {key} not in level {level}") i = np.nonzero(mask)[0][0] to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) codes_list.extend(concat_index.codes) else: codes, categories = factorize_from_iterable(concat_index) levels.append(categories) codes_list.append(codes) if len(names) == len(levels): names = list(names) else: # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError( "Cannot concat indices that do not have the same number of levels" ) # also copies names = names + get_consensus_names(indexes) return MultiIndex(levels=levels, codes=codes_list, names=names, verify_integrity=False) new_index = indexes[0] n = len(new_index) kpieces = len(indexes) # also copies new_names = list(names) new_levels = list(levels) # construct codes new_codes = [] # do something a bit more speedy for hlevel, level in zip(zipped, levels): hlevel = ensure_index(hlevel) mapped = level.get_indexer(hlevel) mask = mapped == -1 if mask.any(): raise ValueError( f"Values not found in passed level: {hlevel[mask]!s}") new_codes.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: new_levels.append(new_index) new_codes.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False)
def to_arrays(data, columns: Index | None, dtype: DtypeObj | None = None) -> tuple[list[ArrayLike], Index]: """ Return list of arrays, columns. """ if isinstance(data, ABCDataFrame): # see test_from_records_with_index_data, test_from_records_bad_index_column if columns is not None: arrays = [ data._ixs(i, axis=1).values for i, col in enumerate(data.columns) if col in columns ] else: columns = data.columns arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] return arrays, columns if not len(data): if isinstance(data, np.ndarray): if data.dtype.names is not None: # i.e. numpy structured array columns = ensure_index(data.dtype.names) arrays = [data[name] for name in columns] return arrays, columns return [], ensure_index([]) elif isinstance(data[0], Categorical): # GH#38845 deprecate special case warnings.warn( "The behavior of DataFrame([categorical, ...]) is deprecated and " "in a future version will be changed to match the behavior of " "DataFrame([any_listlike, ...]). " "To retain the old behavior, pass as a dictionary " "DataFrame({col: categorical, ..})", FutureWarning, stacklevel=4, ) if columns is None: columns = ibase.default_index(len(data)) return data, columns elif isinstance(data, np.ndarray) and data.dtype.names is not None: # e.g. recarray columns = Index(list(data.dtype.names)) arrays = [data[k] for k in columns] return arrays, columns if isinstance(data[0], (list, tuple)): arr = _list_to_arrays(data) elif isinstance(data[0], abc.Mapping): arr, columns = _list_of_dict_to_arrays(data, columns) elif isinstance(data[0], ABCSeries): arr, columns = _list_of_series_to_arrays(data, columns) else: # last ditch effort data = [tuple(x) for x in data] arr = _list_to_arrays(data) content, columns = _finalize_columns_and_data(arr, columns, dtype) return content, columns
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = Index([values.name]) if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values values = [values[:, n] for n in range(values.shape[1])] else: values = [values] if columns is None: columns = Index(range(len(values))) else: columns = ensure_index(columns) return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) else: # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape flat = values.ravel() # GH#40110 see similar check inside sanitize_array rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") values = sanitize_array(flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf) values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(values.shape[0], values.shape[1], index=index, columns=columns) _check_values_indices_shape_match(values, index, columns) if typ == "array": if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( maybe_infer_to_datetimelike(values[:, i].copy())) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) arrays = [values[:, i].copy() for i in range(values.shape[1])] return ArrayManager(arrays, [index, columns], verify_integrity=False) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): dvals_list = [ ensure_block_shape(dval, 2) for dval in maybe_datetime ] block_values = [ new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) ] else: nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] else: nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] if len(columns) == 0: block_values = [] return create_block_manager_from_blocks(block_values, [columns, index])
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = Index([values.name]) if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values values = [values[:, n] for n in range(values.shape[1])] else: values = [values] if columns is None: columns = Index(range(len(values))) else: columns = ensure_index(columns) return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) else: # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape flat = values.ravel() if not is_integer_dtype(dtype): # TODO: skipping integer_dtype is needed to keep the tests passing, # not clear it is correct # Note: we really only need _try_cast, but keeping to exposed funcs values = sanitize_array(flat, None, dtype=dtype, copy=copy, raise_cast_failure=True) else: try: values = construct_1d_ndarray_preserving_na(flat, dtype=dtype, copy=False) except IntCastingNaNError: # following Series, we ignore the dtype and retain floating # values instead of casting nans to meaningless ints pass values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(values.shape[0], values.shape[1], index=index, columns=columns) _check_values_indices_shape_match(values, index, columns) if typ == "array": if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( maybe_infer_to_datetimelike(values[:, i].copy())) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) arrays = [values[:, i].copy() for i in range(values.shape[1])] return ArrayManager(arrays, [index, columns], verify_integrity=False) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] # TODO: What about re-joining object columns? block_values = [ new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) ] else: datelike_vals = maybe_infer_to_datetimelike(values) nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) block_values = [nb] else: nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] if len(columns) == 0: block_values = [] return create_block_manager_from_blocks(block_values, [columns, index])
def to_arrays( data, columns: Optional[Index], dtype: Optional[DtypeObj] = None) -> Tuple[List[ArrayLike], Index]: """ Return list of arrays, columns. """ if isinstance(data, ABCDataFrame): if columns is not None: arrays = [ data._ixs(i, axis=1).values for i, col in enumerate(data.columns) if col in columns ] else: columns = data.columns arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] return arrays, columns if not len(data): if isinstance(data, np.ndarray): # error: Incompatible types in assignment (expression has type # "Optional[Tuple[str, ...]]", variable has type "Optional[Index]") columns = data.dtype.names # type: ignore[assignment] if columns is not None: # i.e. numpy structured array arrays = [data[name] for name in columns] return arrays, ensure_index(columns) return [], ensure_index([]) elif isinstance(data[0], Categorical): if columns is None: columns = ibase.default_index(len(data)) return data, columns elif isinstance(data, np.ndarray) and data.dtype.names is not None: # e.g. recarray columns = Index(list(data.dtype.names)) arrays = [data[k] for k in columns] return arrays, columns if isinstance(data[0], (list, tuple)): content = _list_to_arrays(data) elif isinstance(data[0], abc.Mapping): content, columns = _list_of_dict_to_arrays(data, columns) elif isinstance(data[0], ABCSeries): content, columns = _list_of_series_to_arrays(data, columns) else: # last ditch effort data = [tuple(x) for x in data] content = _list_to_arrays(data) # error: Incompatible types in assignment (expression has type "List[ndarray]", # variable has type "List[Union[Union[str, int, float, bool], Union[Any, Any, Any, # Any]]]") content, columns = _finalize_columns_and_data( # type: ignore[assignment] content, columns, dtype) # error: Incompatible return value type (got "Tuple[ndarray, Index]", expected # "Tuple[List[ExtensionArray], Index]") # error: Incompatible return value type (got "Tuple[ndarray, Index]", expected # "Tuple[List[ndarray], Index]") return content, columns # type: ignore[return-value]
def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. Used in DataFrame.__init__ """ arrays: Union[Sequence[Any], Series] if columns is not None: from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) data_names = arrays.index missing = arrays.isna() if index is None: # GH10856 # raise ValueError if only scalars in dict index = extract_index(arrays[~missing]) else: index = ensure_index(index) # no obvious "empty" int column if missing.any() and not is_integer_dtype(dtype): if dtype is None or ( not is_extension_array_dtype(dtype) # error: Argument 1 to "issubdtype" has incompatible type # "Union[dtype, ExtensionDtype]"; expected "Union[dtype, None, # type, _SupportsDtype, str, Tuple[Any, int], Tuple[Any, # Union[int, Sequence[int]]], List[Any], _DtypeDict, Tuple[Any, # Any]]" and np.issubdtype(dtype, np.flexible) # type: ignore[arg-type] ): # GH#1783 # error: Value of type variable "_DTypeScalar" of "dtype" cannot be # "object" nan_dtype = np.dtype(object) # type: ignore[type-var] else: # error: Incompatible types in assignment (expression has type # "Union[dtype, ExtensionDtype]", variable has type "dtype") nan_dtype = dtype # type: ignore[assignment] val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() else: keys = list(data.keys()) columns = data_names = Index(keys) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies arrays = [ arr if not isinstance(arr, ABCIndex) else arr._data for arr in arrays ] arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ)