def test_datetimetz_dtype(self): dtype = DatetimeTZDtype(unit='ns', tz='US/Eastern') assert find_common_type([dtype, dtype]) == 'datetime64[ns, US/Eastern]' for dtype2 in [DatetimeTZDtype(unit='ns', tz='Asia/Tokyo'), np.dtype('datetime64[ns]'), np.object, np.int64]: assert find_common_type([dtype, dtype2]) == np.object assert find_common_type([dtype2, dtype]) == np.object
def test_period_dtype(self): dtype = PeriodDtype(freq='D') assert find_common_type([dtype, dtype]) == 'period[D]' for dtype2 in [DatetimeTZDtype(unit='ns', tz='Asia/Tokyo'), PeriodDtype(freq='2D'), PeriodDtype(freq='H'), np.dtype('datetime64[ns]'), np.object, np.int64]: assert find_common_type([dtype, dtype2]) == np.object assert find_common_type([dtype2, dtype]) == np.object
def func(self, other, sort=True):
    """Apply the set operation named by the enclosing ``op_name``.

    ``other`` is first passed through ``self._as_like_interval_index``.
    The operation is delegated to the ``_multiindex`` of both operands and
    the result is rebuilt with ``from_tuples`` using ``self.closed``.

    Raises
    ------
    TypeError
        If the common type of the two interval subtypes is object.
    """
    other = self._as_like_interval_index(other)

    # GH 19016: ensure set op will not return a prohibited dtype
    common_subtype = find_common_type(
        [self.dtype.subtype, other.dtype.subtype])
    if is_object_dtype(common_subtype):
        raise TypeError(
            ('can only do {op} between two IntervalIndex '
             'objects that have compatible dtypes').format(op=op_name))

    # ``sort`` is forwarded (positionally) only for 'difference'
    multi_op = getattr(self._multiindex, op_name)
    extra_args = (sort,) if op_name == 'difference' else ()
    combined = multi_op(other._multiindex, *extra_args)
    result_name = get_op_result_name(self, other)

    # GH 19101: ensure empty results have correct dtype
    tuples = (combined.values.astype(self.dtype.subtype)
              if combined.empty else combined.values)

    return type(self).from_tuples(tuples, closed=self.closed,
                                  name=result_name)
def func(self, other, sort=sort):
    """Apply the set operation named by the enclosing ``op_name``.

    When ``other`` is not an ``IntervalIndex`` the operation runs on an
    object-dtype cast of ``self``. For 'difference' only, that result is
    cast back to ``self.dtype``. Otherwise the operation is delegated to
    the ``_multiindex`` of both operands with ``sort`` passed by keyword.

    Raises
    ------
    ValueError
        If both operands are IntervalIndex but ``closed`` differs.
    TypeError
        If the common type of the two interval subtypes is object.
    """
    self._assert_can_do_setop(other)
    other = ensure_index(other)

    if not isinstance(other, IntervalIndex):
        as_object = self.astype(object)
        outcome = getattr(as_object, op_name)(other)
        if op_name == 'difference':
            outcome = outcome.astype(self.dtype)
        return outcome

    if self.closed != other.closed:
        raise ValueError('can only do set operations between two '
                         'IntervalIndex objects that are closed on '
                         'the same side')

    # GH 19016: ensure set op will not return a prohibited dtype
    common_subtype = find_common_type(
        [self.dtype.subtype, other.dtype.subtype])
    if is_object_dtype(common_subtype):
        raise TypeError(
            ('can only do {op} between two IntervalIndex '
             'objects that have compatible dtypes').format(op=op_name))

    multi_op = getattr(self._multiindex, op_name)
    combined = multi_op(other._multiindex, sort=sort)
    result_name = get_op_result_name(self, other)

    # GH 19101: ensure empty results have correct dtype
    tuples = (combined.values.astype(self.dtype.subtype)
              if combined.empty else combined.values)

    return type(self).from_tuples(tuples, closed=self.closed,
                                  name=result_name)
def _sparse_array_op(left, right, op, name):
    """
    Perform a binary operation between two sparse arrays.

    Parameters
    ----------
    left, right : sparse arrays
        Must expose ``dtype``, ``sp_index``, ``sp_values``, ``fill_value``,
        ``astype`` and ``get_values``.
    op : callable
        The binary operation; used directly on the dense / aligned paths.
    name : str
        Name of the operation. Dunder names are stripped to look up the
        matching ``sparse_<name>_<dtype>`` routine in ``splib``.

    Returns
    -------
    The value produced by ``_wrap_result`` for the computed values, index
    and fill value.
    """
    if name.startswith('__'):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    if not is_dtype_equal(left.dtype, right.dtype):
        dtype = find_common_type([left.dtype, right.dtype])
        left = left.astype(dtype)
        right = right.astype(dtype)
    else:
        dtype = left.dtype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        # at least one operand has no gaps: operate on the dense values
        with np.errstate(all='ignore'):
            result = op(left.get_values(), right.get_values())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # identical sparse indexes: operate on the stored values directly
        with np.errstate(all='ignore'):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == 'r':
            # reversed op: swap the operands and use the forward routine
            left, right = right, left
            name = name[1:]

        if name in ('and', 'or') and dtype == 'bool':
            opname = 'sparse_{name}_uint8'.format(name=name)
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            # builtin ``bool`` instead of the deprecated ``np.bool`` alias
            result_dtype = bool
        else:
            opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        sparse_op = getattr(splib, opname)

        with np.errstate(all='ignore'):
            result, index, fill = sparse_op(left_sp_values, left.sp_index,
                                            left.fill_value, right_sp_values,
                                            right.sp_index, right.fill_value)

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
def to_coo(self):
    """
    Return the contents of the frame as a sparse SciPy COO matrix.

    .. versionadded:: 0.20.0

    Returns
    -------
    coo_matrix : scipy.sparse.spmatrix
        If the caller is heterogeneous and contains booleans or objects,
        the result will be of dtype=object. See Notes.

    Raises
    ------
    ImportError
        If ``scipy.sparse`` cannot be imported.

    Notes
    -----
    The dtype will be the lowest-common-denominator type (implicit
    upcasting); that is to say if the dtypes (even of numeric types)
    are mixed, the one that accommodates all will be chosen.

    e.g. If the dtypes are float16 and float32, dtype will be upcast to
    float32. By numpy.find_common_type convention, mixing int64 and
    uint64 will result in a float64 dtype.
    """
    try:
        from scipy.sparse import coo_matrix
    except ImportError:
        raise ImportError('Scipy is not installed')

    target = find_common_type(self.dtypes)
    if isinstance(target, SparseDtype):
        # the stored values are cast to the underlying subtype
        target = target.subtype

    col_parts = []
    row_parts = []
    value_parts = []
    for position, label in enumerate(self):
        column = self[label]
        stored_rows = column.sp_index.to_int_index().indices
        # every stored value of this column shares one column position
        col_parts.append(np.repeat(position, len(stored_rows)))
        row_parts.append(stored_rows)
        value_parts.append(column.sp_values.astype(target, copy=False))

    col_idx = np.concatenate(col_parts)
    row_idx = np.concatenate(row_parts)
    values = np.concatenate(value_parts)
    return coo_matrix((values, (row_idx, col_idx)), shape=self.shape)
def na_op(x, y):
    """Evaluate the enclosing ``op`` on ``x`` and ``y``.

    The operation is first attempted through ``expressions.evaluate``. If
    that raises ``TypeError`` it is retried on only the positions that
    ``notna`` reports as non-null, and the remaining positions are filled
    through ``maybe_upcast_putmask`` with ``np.nan``. The result is always
    post-processed by ``missing.fill_zeros``.
    """
    import pandas.core.computation.expressions as expressions

    try:
        result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
    except TypeError:
        if not isinstance(y, (np.ndarray, ABCSeries, pd.Index)):
            # ``y`` is not array-like here: only ``x`` can hold nulls
            assert isinstance(x, np.ndarray)
            result = np.empty(len(x), dtype=x.dtype)
            mask = notna(x)
            result[mask] = op(x[mask], y)
        else:
            common = find_common_type([x.dtype, y.dtype])
            result = np.empty(x.size, dtype=common)
            mask = notna(x) & notna(y)
            result[mask] = op(x[mask], com._values_from_object(y[mask]))

        # second return value (the "changed" flag) is not needed
        result, _ = maybe_upcast_putmask(result, ~mask, np.nan)

    return missing.fill_zeros(result, x, y, name, fill_zeros)
def func(intvidx_self, other, sort=False):
    """Validate the operands, then delegate to the wrapped ``setop``.

    ``self`` here is the enclosing object that carries ``op_name``;
    ``intvidx_self`` is the index the operation is invoked on.

    When ``other`` is not an ``IntervalIndex`` the operation runs on an
    object-dtype cast of ``intvidx_self``. For 'difference' only, that
    result is cast back to ``intvidx_self.dtype``.

    Raises
    ------
    ValueError
        If both operands are IntervalIndex but ``closed`` differs.
    TypeError
        If the common type of the two interval subtypes is object.
    """
    intvidx_self._assert_can_do_setop(other)
    other = ensure_index(other)

    if not isinstance(other, IntervalIndex):
        as_object = intvidx_self.astype(object)
        outcome = getattr(as_object, self.op_name)(other)
        if self.op_name == 'difference':
            outcome = outcome.astype(intvidx_self.dtype)
        return outcome

    if intvidx_self.closed != other.closed:
        raise ValueError('can only do set operations between two '
                         'IntervalIndex objects that are closed on '
                         'the same side')

    # GH 19016: ensure set op will not return a prohibited dtype
    common_subtype = find_common_type(
        [intvidx_self.dtype.subtype, other.dtype.subtype])
    if is_object_dtype(common_subtype):
        raise TypeError(
            ('can only do {op} between two IntervalIndex '
             'objects that have compatible dtypes').format(op=self.op_name))

    return setop(intvidx_self, other, sort)
def func(self, other):
    """Apply the set operation named by the enclosing ``op_name``.

    ``other`` is passed through ``self._as_like_interval_index`` together
    with the closed-side error message. The operation is then delegated to
    the ``_multiindex`` of both operands. The result keeps ``self.name``
    only when both operands share the same name.

    Raises
    ------
    TypeError
        If the common type of the two interval subtypes is object.
    """
    closed_msg = ('can only do set operations between two IntervalIndex '
                  'objects that are closed on the same side')
    other = self._as_like_interval_index(other, closed_msg)

    # GH 19016: ensure set op will not return a prohibited dtype
    common_subtype = find_common_type(
        [self.dtype.subtype, other.dtype.subtype])
    if is_object_dtype(common_subtype):
        raise TypeError(
            ('can only do {op} between two IntervalIndex '
             'objects that have compatible dtypes').format(op=op_name))

    multi_op = getattr(self._multiindex, op_name)
    combined = multi_op(other._multiindex)

    if self.name == other.name:
        result_name = self.name
    else:
        result_name = None

    # GH 19101: ensure empty results have correct dtype
    tuples = (combined.values.astype(self.dtype.subtype)
              if combined.empty else combined.values)

    return type(self).from_tuples(tuples, closed=self.closed,
                                  name=result_name)
def _sparse_array_op(left, right, op, name, series=False):
    """
    Perform a binary operation between two sparse arrays.

    Parameters
    ----------
    left, right : sparse arrays
        Must expose ``dtype``, ``values``, ``sp_index``, ``sp_values``,
        ``fill_value``, ``astype`` and ``get_values``.
    op : callable
        The binary operation; used directly on the dense / aligned paths.
    name : str
        Non-dunder operation name, used to look up the matching
        ``sparse_<name>_<dtype>`` routine in ``splib``.
    series : bool, default False
        When True and both operands are integer typed, floor-division and
        modulo by an operand containing zero are computed in float64.

    Returns
    -------
    The value produced by ``_wrap_result`` for the computed values, index
    and fill value.
    """
    if series and is_integer_dtype(left) and is_integer_dtype(right):
        # series coerces to float64 if result should have NaN/inf
        if name in ('floordiv', 'mod') and (right.values == 0).any():
            left = left.astype(np.float64)
            right = right.astype(np.float64)
        elif name in ('rfloordiv', 'rmod') and (left.values == 0).any():
            left = left.astype(np.float64)
            right = right.astype(np.float64)

    # dtype used to find corresponding sparse method
    if not is_dtype_equal(left.dtype, right.dtype):
        dtype = find_common_type([left.dtype, right.dtype])
        left = left.astype(dtype)
        right = right.astype(dtype)
    else:
        dtype = left.dtype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        # at least one operand has no gaps: operate on the dense values
        with np.errstate(all='ignore'):
            result = op(left.get_values(), right.get_values())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # identical sparse indexes: operate on the stored values directly
        with np.errstate(all='ignore'):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == 'r':
            # reversed op: swap the operands and use the forward routine
            left, right = right, left
            name = name[1:]

        if name in ('and', 'or') and dtype == 'bool':
            opname = 'sparse_{name}_uint8'.format(name=name)
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            # builtin ``bool`` instead of the deprecated ``np.bool`` alias
            result_dtype = bool
        else:
            opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        sparse_op = getattr(splib, opname)

        with np.errstate(all='ignore'):
            result, index, fill = sparse_op(left_sp_values, left.sp_index,
                                            left.fill_value, right_sp_values,
                                            right.sp_index, right.fill_value)

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
def test_period_dtype_match():
    """Two identical daily period dtypes resolve to ``period[D]``."""
    daily = PeriodDtype(freq="D")
    common = find_common_type([daily, daily])
    assert common == "period[D]"
def test_datetimetz_dtype_match():
    """Two identical tz-aware dtypes resolve to that same dtype."""
    eastern = DatetimeTZDtype(unit="ns", tz="US/Eastern")
    common = find_common_type([eastern, eastern])
    assert common == "datetime64[ns, US/Eastern]"
def test_raises_empty_input():
    """An empty list of types is rejected with a ``ValueError``."""
    no_types = []
    expected_message = "no types given"
    with pytest.raises(ValueError, match=expected_message):
        find_common_type(no_types)
def test_period_dtype_mismatch(dtype2): dtype = PeriodDtype(freq="D") assert find_common_type([dtype, dtype2]) == np.object assert find_common_type([dtype2, dtype]) == np.object
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: if not isinstance(dtype, IntervalDtype): return False common_subtype = find_common_type([self.dtype, dtype]) return not is_object_dtype(common_subtype)
def test_period_dtype_mismatch(dtype2): dtype = PeriodDtype(freq="D") assert find_common_type([dtype, dtype2]) == object assert find_common_type([dtype2, dtype]) == object
def read_json(
    cls,
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=True,
    convert_axes=True,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    lines=False,
    chunksize=None,
    compression="infer",
):
    """Read a JSON file, splitting the work into remote tasks if possible.

    All parameters are collected into a keyword dict. When
    ``cls.read_json_remote_task`` is None, or ``lines`` is falsy, that dict
    is forwarded unchanged to the parent class' ``read_json``. Otherwise
    the file is cut into byte ranges ending on line boundaries and one
    remote task is launched per range.

    Returns
    -------
    The parent implementation's result on the fallback paths, otherwise
    ``cls.query_compiler_cls`` wrapping the assembled frame.
    """
    kwargs = {
        "path_or_buf": path_or_buf,
        "orient": orient,
        "typ": typ,
        "dtype": dtype,
        "convert_axes": convert_axes,
        "convert_dates": convert_dates,
        "keep_default_dates": keep_default_dates,
        "numpy": numpy,
        "precise_float": precise_float,
        "date_unit": date_unit,
        "encoding": encoding,
        "lines": lines,
        "chunksize": chunksize,
        "compression": compression,
    }
    if cls.read_json_remote_task is None:
        return super(RayIO, cls).read_json(**kwargs)

    if not lines:
        ErrorMessage.default_to_pandas(
            "`read_json` only optimized with `lines=True`")
        return super(RayIO, cls).read_json(**kwargs)
    else:
        # TODO: Pick up the columns in an optimized way from all data
        # All rows must be read because some rows may have missing data
        # Currently assumes all rows have the same columns
        from io import BytesIO

        # Read the first line inside a context manager so the handle is
        # closed deterministically instead of being left to the GC.
        with open(path_or_buf, "rb") as first_line_file:
            first_line = first_line_file.readline()
        columns = pandas.read_json(BytesIO(first_line), lines=True).columns
        kwargs["columns"] = columns
        empty_pd_df = pandas.DataFrame(columns=columns)

        path_or_buf = kwargs.pop("path_or_buf")

        with file_open(path_or_buf, "rb",
                       kwargs.get("compression", "infer")) as f:
            total_bytes = file_size(f)
            from modin.pandas import DEFAULT_NPARTITIONS

            num_partitions = DEFAULT_NPARTITIONS
            num_splits = min(len(columns), num_partitions)
            chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)

            partition_ids = []
            index_ids = []
            dtypes_ids = []

            column_chunksize = compute_chunksize(empty_pd_df, num_splits,
                                                 axis=1)
            if column_chunksize > len(columns):
                column_widths = [len(columns)]
                num_splits = 1
            else:
                # every split is ``column_chunksize`` wide except the last,
                # which takes whatever columns remain
                column_widths = [
                    column_chunksize
                    if i != num_splits - 1
                    else len(columns) - (column_chunksize * (num_splits - 1))
                    for i in range(num_splits)
                ]

            while f.tell() < total_bytes:
                start = f.tell()
                f.seek(chunk_size, os.SEEK_CUR)
                f.readline()  # advance to the end of the current line
                partition_id = cls.read_json_remote_task._remote(
                    args=(path_or_buf, num_splits, start, f.tell(), kwargs),
                    num_return_vals=num_splits + 3,
                )
                # Of the three trailing return values only the third-last
                # (row count) and second-last (dtypes) are used here.
                partition_ids.append(partition_id[:-3])
                index_ids.append(partition_id[-3])
                dtypes_ids.append(partition_id[-2])

        row_lengths = ray.get(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))

        # Combine the per-partition dtypes into one dtype per column.
        dtypes = (pandas.concat(ray.get(dtypes_ids), axis=1).apply(
            lambda row: find_common_type(row.values),
            axis=1).squeeze(axis=0))

        partition_ids = [[
            cls.frame_partition_cls(
                partition_ids[i][j],
                length=row_lengths[i],
                width=column_widths[j],
            ) for j in range(len(partition_ids[i]))
        ] for i in range(len(partition_ids))]

        if isinstance(dtypes, pandas.Series):
            dtypes.index = columns
        else:
            dtypes = pandas.Series(dtypes, index=columns)

        new_frame = cls.frame_cls(
            np.array(partition_ids),
            new_index,
            columns,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_frame._apply_index_objs(axis=0)
        return cls.query_compiler_cls(new_frame)
def _read_csv_from_file_ray(cls, filepath, kwargs=None):
    """Construct a query compiler from a CSV file using remote tasks.

    Args:
        filepath (str): path to the CSV file.
        kwargs (dict): args excluding filepath provided to read_csv.
            Defaults to an empty dict. NOTE: a dict passed by the caller
            is modified in place (``skiprows`` is set, ``parse_dates`` is
            removed and a non-integer ``usecols`` is removed).

    Returns:
        The object built by ``cls.query_compiler_cls``; when ``squeeze``
        is requested and there is exactly one column, the result of
        indexing that object by the single column.
    """
    # A fresh dict per call: the body mutates ``kwargs``, so a shared
    # mutable default would leak state between calls.
    if kwargs is None:
        kwargs = {}
    names = kwargs.get("names", None)
    index_col = kwargs.get("index_col", None)
    if names is None:
        # For the sake of the empty df, we assume no `index_col` to get the
        # correct column names before we build the index. Because we pass
        # `names` in, this step has to happen without removing the
        # `index_col` otherwise it will not be assigned correctly
        kwargs["index_col"] = None
        names = pandas.read_csv(
            filepath, **dict(kwargs, nrows=0, skipfooter=0)).columns
        kwargs["index_col"] = index_col
    empty_pd_df = pandas.read_csv(filepath,
                                  **dict(kwargs, nrows=0, skipfooter=0))
    column_names = empty_pd_df.columns
    skipfooter = kwargs.get("skipfooter", None)
    skiprows = kwargs.pop("skiprows", None)
    usecols = kwargs.get("usecols", None)
    usecols_md = _validate_usecols_arg(kwargs.get("usecols", None))
    if usecols is not None and usecols_md[1] != "integer":
        del kwargs["usecols"]
        # Close the probe handle once the header has been read.
        with file_open(filepath, "rb") as cols_file:
            all_cols = pandas.read_csv(
                cols_file, **dict(kwargs, nrows=0, skipfooter=0)).columns
        usecols = all_cols.get_indexer_for(list(usecols_md[0]))
    parse_dates = kwargs.pop("parse_dates", False)
    partition_kwargs = dict(
        kwargs,
        header=None,
        names=names,
        skipfooter=0,
        skiprows=None,
        parse_dates=parse_dates,
        usecols=usecols,
    )
    with file_open(filepath, "rb", kwargs.get("compression",
                                              "infer")) as f:
        # Get the BOM if necessary
        prefix = b""
        if kwargs.get("encoding", None) is not None:
            prefix = f.readline()
            partition_kwargs["skiprows"] = 1
            f.seek(0, os.SEEK_SET)  # Return to beginning of file

        prefix_id = ray.put(prefix)
        partition_kwargs_id = ray.put(partition_kwargs)
        # Skip the header since we already have the header information and
        # skip the rows we are told to skip.
        kwargs["skiprows"] = skiprows
        cls._skip_header(f, kwargs)
        # Launch tasks to read partitions
        partition_ids = []
        index_ids = []
        dtypes_ids = []
        total_bytes = file_size(f)
        # Max number of partitions available
        from modin.pandas import DEFAULT_NPARTITIONS

        num_partitions = DEFAULT_NPARTITIONS
        # This is the number of splits for the columns
        num_splits = min(len(column_names), num_partitions)
        # This is the chunksize each partition will read
        chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)

        # Metadata
        column_chunksize = compute_chunksize(empty_pd_df, num_splits,
                                             axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of
            # empty objects.
            num_splits = 1
        else:
            # full-width splits first, then the remainder, then zeros
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]

        while f.tell() < total_bytes:
            start = f.tell()
            f.seek(chunk_size, os.SEEK_CUR)
            f.readline()  # Read a whole number of lines
            # The workers return multiple objects for each part of the file
            # read:
            # - The first n - 2 objects are partitions of data
            # - The n - 1 object is the length of the partition or the
            #   index if `index_col` is specified. We compute the index
            #   below.
            # - The nth object is the dtypes of the partition. We combine
            #   these to form the final dtypes below.
            partition_id = cls.read_csv_remote_task._remote(
                args=(
                    filepath,
                    num_splits,
                    start,
                    f.tell(),
                    partition_kwargs_id,
                    prefix_id,
                ),
                num_return_vals=num_splits + 2,
            )
            partition_ids.append(partition_id[:-2])
            index_ids.append(partition_id[-2])
            dtypes_ids.append(partition_id[-1])

    # Compute the index based on a sum of the lengths of each partition (by
    # default) or based on the column(s) that were requested.
    if index_col is None:
        row_lengths = ray.get(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))
    else:
        index_objs = ray.get(index_ids)
        row_lengths = [len(o) for o in index_objs]
        new_index = index_objs[0].append(index_objs[1:])
        new_index.name = empty_pd_df.index.name

    # Compute dtypes by collecting and combining all of the partitions. The
    # reported dtypes from differing rows can be different based on the
    # inference in the limited data seen by each worker. We use pandas to
    # compute the exact dtype over the whole column for each column. The
    # index is set below.
    dtypes = (pandas.concat(ray.get(dtypes_ids), axis=1).apply(
        lambda row: find_common_type(row.values), axis=1).squeeze(axis=0))

    partition_ids = [[
        cls.frame_partition_cls(partition_ids[i][j],
                                length=row_lengths[i],
                                width=column_widths[j])
        for j in range(len(partition_ids[i]))
    ] for i in range(len(partition_ids))]

    # If parse_dates is present, the column names that we have might not be
    # the same length as the returned column names. If we do need to modify
    # the column names, we remove the old names from the column names and
    # insert the new one at the front of the Index.
    if parse_dates is not None:
        # We have to recompute the column widths if `parse_dates` is set
        # because we are not guaranteed to have the correct information
        # regarding how many columns are on each partition.
        column_widths = None
        # Check if is list of lists
        if isinstance(parse_dates, list) and isinstance(
                parse_dates[0], list):
            for group in parse_dates:
                new_col_name = "_".join(group)
                column_names = column_names.drop(group).insert(
                    0, new_col_name)
        # Check if it is a dictionary
        elif isinstance(parse_dates, dict):
            for new_col_name, group in parse_dates.items():
                column_names = column_names.drop(group).insert(
                    0, new_col_name)

    # Set the index for the dtypes to the column names
    if isinstance(dtypes, pandas.Series):
        dtypes.index = column_names
    else:
        dtypes = pandas.Series(dtypes, index=column_names)

    new_frame = cls.frame_cls(
        partition_ids,
        new_index,
        column_names,
        row_lengths,
        column_widths,
        dtypes=dtypes,
    )
    new_query_compiler = cls.query_compiler_cls(new_frame)

    if skipfooter:
        new_query_compiler = new_query_compiler.drop(
            new_query_compiler.index[-skipfooter:])
    if kwargs.get("squeeze", False) and len(
            new_query_compiler.columns) == 1:
        return new_query_compiler[new_query_compiler.columns[0]]
    if index_col is None:
        new_query_compiler._modin_frame._apply_index_objs(axis=0)
    return new_query_compiler
def test_datetimetz_dtype_match():
    """Two identical tz-aware dtypes resolve to that same dtype."""
    eastern = DatetimeTZDtype(unit="ns", tz="US/Eastern")
    resolved = find_common_type([eastern] * 2)
    assert resolved == "datetime64[ns, US/Eastern]"
def test_categorical_dtype(dtypes, exp_type): assert find_common_type(dtypes) == exp_type
def test_raises_empty_input():
    """``find_common_type`` refuses an empty list of types."""
    raises_no_types = pytest.raises(ValueError, match="no types given")
    with raises_no_types:
        find_common_type([])
def test_numpy_dtypes(source_dtypes, expected_common_dtype): assert find_common_type(source_dtypes) == expected_common_dtype
def test_numpy_dtypes(self):
    """Check ``find_common_type`` against a table of NumPy dtype cases.

    Each table entry is ``(source_types, destination_type)``. An empty
    input must raise ``ValueError``.
    """
    # The builtin ``object`` replaces the ``np.object`` alias, which was
    # deprecated and later removed from NumPy.
    # (source_types, destination_type)
    testcases = (
        # identity
        ((np.int64,), np.int64),
        ((np.uint64,), np.uint64),
        ((np.float32,), np.float32),
        ((object,), object),

        # into ints
        ((np.int16, np.int64), np.int64),
        ((np.int32, np.uint32), np.int64),
        ((np.uint16, np.uint64), np.uint64),

        # into floats
        ((np.float16, np.float32), np.float32),
        ((np.float16, np.int16), np.float32),
        ((np.float32, np.int16), np.float32),
        ((np.uint64, np.int64), np.float64),
        ((np.int16, np.float64), np.float64),
        ((np.float16, np.int64), np.float64),

        # into others
        ((np.complex128, np.int32), np.complex128),
        ((object, np.float32), object),
        ((object, np.int16), object),

        # bool with int
        ((np.dtype('bool'), np.int64), object),
        ((np.dtype('bool'), np.int32), object),
        ((np.dtype('bool'), np.int16), object),
        ((np.dtype('bool'), np.int8), object),
        ((np.dtype('bool'), np.uint64), object),
        ((np.dtype('bool'), np.uint32), object),
        ((np.dtype('bool'), np.uint16), object),
        ((np.dtype('bool'), np.uint8), object),

        # bool with float
        ((np.dtype('bool'), np.float64), object),
        ((np.dtype('bool'), np.float32), object),

        ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ns]')),
         np.dtype('datetime64[ns]')),
        ((np.dtype('timedelta64[ns]'), np.dtype('timedelta64[ns]')),
         np.dtype('timedelta64[ns]')),

        ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ms]')),
         np.dtype('datetime64[ns]')),
        ((np.dtype('timedelta64[ms]'), np.dtype('timedelta64[ns]')),
         np.dtype('timedelta64[ns]')),

        ((np.dtype('datetime64[ns]'), np.dtype('timedelta64[ns]')),
         object),
        ((np.dtype('datetime64[ns]'), np.int64), object)
    )
    for src, common in testcases:
        assert find_common_type(src) == common

    with pytest.raises(ValueError):
        # empty
        find_common_type([])
def get_dtype(X):
    """Return a single dtype describing ``X``.

    For inputs that ``is_DataFrame`` accepts, the common type of the
    column dtypes is returned; otherwise ``X.dtype``. If an
    ``ImportError`` is raised along the way, the ``dtype`` attribute of
    ``X`` is returned when present, else ``None``.
    """
    try:
        from pandas.core.dtypes.cast import find_common_type

        if is_DataFrame(X):
            return find_common_type(X.dtypes)
        return X.dtype
    except ImportError:
        return getattr(X, "dtype", None)
def concat_compat(to_concat, axis: int = 0):
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    The work is dispatched in this order: extension-array inputs (cast to a
    common dtype first when the dtypes differ), datetime/timedelta inputs
    (handed to ``concat_datetime``), all-empty inputs (possibly coerced to
    object), and finally plain ``np.concatenate``.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    # filter empty arrays
    # 1-d dtypes always are included here
    def is_nonempty(x) -> bool:
        # arrays with no dimension along ``axis`` are always kept
        if x.ndim <= axis:
            return True
        return x.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    non_empties = [x for x in to_concat if is_nonempty(x)]
    # empties are dropped only for axis 0 and only if something remains
    if non_empties and axis == 0:
        to_concat = non_empties

    typs = get_dtype_kinds(to_concat)
    _contains_datetime = any(typ.startswith("datetime") for typ in typs)

    all_empty = not len(non_empties)
    single_dtype = len({x.dtype for x in to_concat}) == 1
    any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)

    if any_ea:
        if not single_dtype:
            # bring every array to one common dtype before concatenating
            target_dtype = find_common_type([x.dtype for x in to_concat])
            to_concat = [
                _cast_to_common_type(arr, target_dtype) for arr in to_concat
            ]

        if isinstance(to_concat[0], ExtensionArray):
            # let the extension array class concatenate its own kind
            cls = type(to_concat[0])
            return cls._concat_same_type(to_concat)
        else:
            # NOTE: ``axis`` is not forwarded on this path
            return np.concatenate(to_concat)

    elif _contains_datetime or "timedelta" in typs:
        return concat_datetime(to_concat, axis=axis, typs=typs)

    elif all_empty:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        typs = get_dtype_kinds(to_concat)
        if len(typs) != 1:

            if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}):
                # let numpy coerce
                pass
            else:
                # coerce to object
                to_concat = [x.astype("object") for x in to_concat]

    return np.concatenate(to_concat, axis=axis)
def _sparse_array_op(left, right, op, name, series=False):
    """
    Perform a binary operation between two sparse arrays.

    Parameters
    ----------
    left, right : sparse arrays
        Must expose ``dtype``, ``values``, ``sp_index``, ``sp_values``,
        ``fill_value``, ``astype`` and ``get_values``.
    op : callable
        The binary operation; used directly on the dense / aligned paths.
    name : str
        Non-dunder operation name, used to look up the matching
        ``sparse_<name>_<dtype>`` routine in ``splib``.
    series : bool, default False
        When True and both operands are integer typed, floor-division and
        modulo by an operand containing zero are computed in float64.

    Returns
    -------
    The value produced by ``_wrap_result`` for the computed values, index
    and fill value.
    """
    if series and is_integer_dtype(left) and is_integer_dtype(right):
        # series coerces to float64 if result should have NaN/inf
        needs_float = (
            (name in ('floordiv', 'mod') and (right.values == 0).any()) or
            (name in ('rfloordiv', 'rmod') and (left.values == 0).any())
        )
        if needs_float:
            left = left.astype(np.float64)
            right = right.astype(np.float64)

    # dtype used to find corresponding sparse method
    if not is_dtype_equal(left.dtype, right.dtype):
        dtype = find_common_type([left.dtype, right.dtype])
        left = left.astype(dtype)
        right = right.astype(dtype)
    else:
        dtype = left.dtype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        # at least one operand has no gaps: operate on the dense values
        with np.errstate(all='ignore'):
            result = op(left.get_values(), right.get_values())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # identical sparse indexes: operate on the stored values directly
        with np.errstate(all='ignore'):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == 'r':
            # reversed op: swap the operands and use the forward routine
            left, right = right, left
            name = name[1:]

        if name in ('and', 'or') and dtype == 'bool':
            # the template has no ``{dtype}`` field, so only ``name`` is
            # passed to ``format``
            opname = 'sparse_{name}_uint8'.format(name=name)
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            # builtin ``bool`` instead of the deprecated ``np.bool`` alias
            result_dtype = bool
        else:
            opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        sparse_op = getattr(splib, opname)

        with np.errstate(all='ignore'):
            result, index, fill = sparse_op(left_sp_values, left.sp_index,
                                            left.fill_value, right_sp_values,
                                            right.sp_index, right.fill_value)

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
def _masked_arith_op(x: np.ndarray, y, op):
    """
    If the given arithmetic operation fails, attempt it again on
    only the non-null elements of the input array(s).

    Parameters
    ----------
    x : np.ndarray
    y : np.ndarray, Series, Index
    op : binary operator

    Returns
    -------
    np.ndarray
        Result reshaped to ``x.shape``; positions excluded by the mask are
        filled in by ``maybe_upcast_putmask``.

    Raises
    ------
    ValueError
        If ``y`` is an ndarray whose length differs from ``x``.
    TypeError
        If ``y`` is neither an ndarray nor a scalar.
    """
    # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes
    # the logic valid for both Series and DataFrame ops.
    xrav = x.ravel()
    assert isinstance(x, np.ndarray), type(x)
    if isinstance(y, np.ndarray):
        dtype = find_common_type([x.dtype, y.dtype])
        # error: Argument "dtype" to "empty" has incompatible type
        # "Union[dtype, ExtensionDtype]"; expected "Union[dtype, None, type,
        # _SupportsDtype, str, Tuple[Any, int], Tuple[Any, Union[int,
        # Sequence[int]]], List[Any], _DtypeDict, Tuple[Any, Any]]"
        result = np.empty(x.size, dtype=dtype)  # type: ignore[arg-type]

        if len(x) != len(y):
            raise ValueError(x.shape, y.shape)
        else:
            ymask = notna(y)

        # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex
        #  we would get int64 dtype, see GH#19956
        yrav = y.ravel()
        mask = notna(xrav) & ymask.ravel()

        # See GH#5284, GH#5035, GH#19448 for historical reference
        if mask.any():
            with np.errstate(all="ignore"):
                result[mask] = op(xrav[mask], yrav[mask])

    else:
        if not is_scalar(y):
            raise TypeError(
                f"Cannot broadcast np.ndarray with operand of type {type(y)}"
            )

        # mask is only meaningful for x
        result = np.empty(x.size, dtype=x.dtype)
        mask = notna(xrav)

        # 1 ** np.nan is 1. So we have to unmask those.
        if op is pow:
            # Compare the ravelled values: ``mask`` is 1D, so using the
            # un-ravelled ``x`` would not line up for 2D input.
            mask = np.where(xrav == 1, False, mask)
        elif op is rpow:
            mask = np.where(y == 1, False, mask)

        if mask.any():
            with np.errstate(all="ignore"):
                result[mask] = op(xrav[mask], y)

    result = maybe_upcast_putmask(result, ~mask)
    result = result.reshape(x.shape)  # 2D compat
    return result
def test_raises_empty_input(self):
    """``find_common_type`` raises ``ValueError`` for an empty list."""
    no_types = []
    with pytest.raises(ValueError):
        find_common_type(no_types)
def putmask_smart(values: np.ndarray, mask: np.ndarray, new) -> np.ndarray:
    """
    Return a new ndarray, try to preserve dtype if possible.

    A fast path is tried first: if both ``values`` and the masked part of
    ``new`` are integer/float typed and casting that part to
    ``values.dtype`` loses nothing, a copy of ``values`` is updated and
    returned. Otherwise the work is handed to ``_putmask_preserve``, after
    casting ``values`` to a common dtype when the dtype kinds differ.

    Parameters
    ----------
    values : np.ndarray
        `values`, updated in-place.
        NOTE(review): the fast path below works on a copy; whether
        ``_putmask_preserve`` mutates its input is not visible here.
    mask : np.ndarray[bool]
        Applies to both sides (array like).
    new : `new values` either scalar or an array like aligned with `values`

    Returns
    -------
    values : ndarray with updated values
        this *may* be a copy of the original

    See Also
    --------
    ndarray.putmask
    """
    # we cannot use np.asarray() here as we cannot have conversions
    # that numpy does when numeric are mixed with strings

    # n should be the length of the mask or a scalar here
    if not is_list_like(new):
        new = np.broadcast_to(new, mask.shape)

    # see if we are only masking values that, if put,
    # will work in the current dtype
    try:
        nn = new[mask]
    except TypeError:
        # TypeError: only integer scalar arrays can be converted to a scalar index
        pass
    else:
        # make sure that we have a nullable type if we have nulls
        if not isna_compat(values, nn[0]):
            pass
        elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)):
            # only compare integers/floats
            pass
        elif not (is_float_dtype(values.dtype) or is_integer_dtype(values.dtype)):
            # only compare integers/floats
            pass
        else:

            # we ignore ComplexWarning here
            with warnings.catch_warnings(record=True):
                warnings.simplefilter("ignore", np.ComplexWarning)
                nn_at = nn.astype(values.dtype)

            # the cast is lossless only if every element round-trips
            comp = nn == nn_at
            if is_list_like(comp) and comp.all():
                nv = values.copy()
                nv[mask] = nn_at
                return nv

    new = np.asarray(new)

    if values.dtype.kind == new.dtype.kind:
        # preserves dtype if possible
        return _putmask_preserve(values, new, mask)

    # dtype kinds differ: cast ``values`` to the common dtype first
    dtype = find_common_type([values.dtype, new.dtype])
    # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible type
    # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any], None, type,
    # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]],
    # List[Any], _DTypeDict, Tuple[Any, Any]]]"
    values = values.astype(dtype)  # type: ignore[arg-type]

    return _putmask_preserve(values, new, mask)
def test_numpy_dtypes(source_dtypes, expected_common_dtype): assert find_common_type(source_dtypes) == expected_common_dtype
def _sparse_array_op(
    left: "SparseArray", right: "SparseArray", op: Callable, name: str
) -> Any:
    """
    Apply a binary operator to a pair of sparse arrays.

    Parameters
    ----------
    left : SparseArray
    right : SparseArray
    op : Callable
        Binary operation, applied to the stored values and to the fill values.
    name : str
        Name of the operation.  Surrounding dunders are stripped and the
        remainder is used to look up the matching routine in ``splib``.

    Returns
    -------
    Any
        Whatever ``_wrap_result`` builds from the computed values, sparse
        index and fill value.
    """
    # splib routines are keyed by the bare op name, so drop the dunders
    if name.startswith("__"):
        name = name[2:-2]

    # the subtype decides which sparse routine is looked up further down
    left_subtype = left.dtype.subtype
    right_subtype = right.dtype.subtype

    if is_dtype_equal(left_subtype, right_subtype):
        dtype = left_subtype
    else:
        # bring both operands to a shared subtype, keeping each fill value
        common_subtype = find_common_type([left_subtype, right_subtype])
        left_sparse_dtype = SparseDtype(common_subtype, left.fill_value)
        right_sparse_dtype = SparseDtype(common_subtype, right.fill_value)

        # TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
        left = left.astype(left_sparse_dtype)
        right = right.astype(right_sparse_dtype)
        dtype = left_sparse_dtype.subtype

    # stays None unless a specific result dtype is forced below
    result_dtype = None

    left_has_no_gaps = left.sp_index.ngaps == 0
    if left_has_no_gaps or right.sp_index.ngaps == 0:
        # at least one operand has no gaps: operate on dense values
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        index = left.sp_index if left_has_no_gaps else right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # identical sparse layout: operate directly on the stored values
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        # reflected op: swap the operands and use the forward routine
        if name[0] == "r":
            left, right = right, left
            name = name[1:]

        is_bool_logical = name in ("and", "or", "xor") and dtype == "bool"
        if is_bool_logical:
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
def test_categorical_dtype(dtypes, exp_type):
    """``find_common_type`` must resolve ``dtypes`` to ``exp_type``."""
    common = find_common_type(dtypes)
    assert common == exp_type
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
    """
    Concatenate a collection of arrays, each holding a single 'normalized'
    dtype (e.g. an object array is assumed to be non-datetimelike), and
    return one array whose dtype preserves the inputs' dtypes where possible.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    def _has_data(arr) -> bool:
        # arrays with too few dimensions for ``axis`` are always kept
        return arr.ndim <= axis or arr.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    non_empties = [arr for arr in to_concat if _has_data(arr)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    kinds = {arr.dtype.kind for arr in to_concat}
    contains_datetime = not kinds.isdisjoint(("m", "M")) or any(
        isinstance(arr, ABCExtensionArray) and arr.ndim > 1 for arr in to_concat
    )

    all_empty = len(non_empties) == 0
    distinct_dtypes = {arr.dtype for arr in to_concat}
    single_dtype = len(distinct_dtypes) == 1
    any_ea = any(isinstance(arr.dtype, ExtensionDtype) for arr in to_concat)

    if contains_datetime:
        return _concat_datetime(to_concat, axis=axis)

    if any_ea:
        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if not single_dtype:
            target_dtype = find_common_type([arr.dtype for arr in to_concat])
            to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat]

        first = to_concat[0]
        if isinstance(first, ABCExtensionArray):
            # TODO: what about EA-backed Index?
            return type(first)._concat_same_type(to_concat)
        return np.concatenate(to_concat)

    if all_empty and len(kinds) != 1:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        numpy_can_coerce = kinds <= {"i", "u", "f"} or kinds <= {"b", "i", "u"}
        if not numpy_can_coerce:
            # coerce to object
            to_concat = [arr.astype("object") for arr in to_concat]

    return np.concatenate(to_concat, axis=axis)
def test_datetimetz_dtype_mismatch(dtype2):
    """A tz-aware dtype combined with ``dtype2`` must fall back to object."""
    dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern")
    # Compare against the builtin ``object``: the ``np.object`` alias no
    # longer exists on NumPy >= 1.24.
    assert find_common_type([dtype, dtype2]) == object
    assert find_common_type([dtype2, dtype]) == object
def masked_arith_op(x: np.ndarray, y, op):
    """
    If the given arithmetic operation fails, attempt it again on
    only the non-null elements of the input array(s).

    Parameters
    ----------
    x : np.ndarray
    y : np.ndarray or scalar
        An ndarray must have the same length as ``x``; anything that is
        neither an ndarray nor a scalar is rejected.
    op : binary operator

    Returns
    -------
    np.ndarray
        Result reshaped to ``x.shape``; positions excluded by the mask are
        filled with NaN via ``maybe_upcast_putmask``.

    Raises
    ------
    ValueError
        If ``y`` is an ndarray whose length differs from ``x`` or whose
        raveled shape does not match the mask.
    TypeError
        If ``y`` is neither an ndarray nor a scalar.
    """
    # Check the input type before calling any ndarray method on it.
    assert isinstance(x, np.ndarray), type(x)

    # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes
    # the logic valid for both Series and DataFrame ops.
    xrav = x.ravel()

    if isinstance(y, np.ndarray):
        dtype = find_common_type([x.dtype, y.dtype])
        result = np.empty(x.size, dtype=dtype)

        if len(x) != len(y):
            raise ValueError(x.shape, y.shape)
        ymask = notna(y)

        # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex
        #  we would get int64 dtype, see GH#19956
        yrav = y.ravel()
        mask = notna(xrav) & ymask.ravel()

        if yrav.shape != mask.shape:
            # FIXME: GH#5284, GH#5035, GH#19448
            # Without specifically raising here we get mismatched
            # errors in Py3 (TypeError) vs Py2 (ValueError)
            # Note: Only = an issue in DataFrame case
            raise ValueError("Cannot broadcast operands together.")

        if mask.any():
            with np.errstate(all="ignore"):
                result[mask] = op(xrav[mask], yrav[mask])

    else:
        if not is_scalar(y):
            raise TypeError(
                f"Cannot broadcast np.ndarray with operand of type { type(y) }"
            )

        # mask is only meaningful for x
        result = np.empty(x.size, dtype=x.dtype)
        mask = notna(xrav)

        # 1 ** np.nan is 1. So we have to unmask those.
        if op is pow:
            # Compare the raveled values so the condition has the same 1-D
            # shape as ``mask``; comparing 2-D ``x`` directly would not line
            # up with the flat mask.
            mask = np.where(xrav == 1, False, mask)
        elif op is rpow:
            mask = np.where(y == 1, False, mask)

        if mask.any():
            with np.errstate(all="ignore"):
                result[mask] = op(xrav[mask], y)

    result, _ = maybe_upcast_putmask(result, ~mask, np.nan)
    result = result.reshape(x.shape)  # 2D compat
    return result
def test_categorical_dtype(self):
    """Categorical dtypes resolve to 'category'; mixing with object gives object."""
    dtype = CategoricalDtype()
    assert find_common_type([dtype]) == 'category'
    assert find_common_type([dtype, dtype]) == 'category'
    # Use the builtin ``object``: the ``np.object`` alias no longer exists
    # on NumPy >= 1.24.
    assert find_common_type([object, dtype]) == object
def _sparse_array_op(left, right, op, name):
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : Union[SparseArray, ndarray]
    right : Union[SparseArray, ndarray]
    op : Callable
        The binary operation to perform
    name : str
        Name of the callable.  Surrounding dunders are stripped before the
        name is used to look up the matching routine in ``splib``.

    Returns
    -------
    SparseArray
    """
    # type: (SparseArray, SparseArray, Callable, str) -> Any
    if name.startswith('__'):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        # cast both operands to a shared subtype, keeping each fill value
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        # TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
        left = left.astype(ltype)
        right = right.astype(rtype)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        # at least one operand has no gaps: operate on the dense values
        with np.errstate(all='ignore'):
            result = op(left.get_values(), right.get_values())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # identical sparse layout: operate directly on the stored values
        with np.errstate(all='ignore'):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        # reflected op: swap the operands and use the forward routine
        if name[0] == 'r':
            left, right = right, left
            name = name[1:]

        if name in ('and', 'or') and dtype == 'bool':
            opname = 'sparse_{name}_uint8'.format(name=name)
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            # Builtin ``bool`` rather than ``np.bool``: that alias was
            # removed in NumPy 1.24.
            result_dtype = bool
        else:
            opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        sparse_op = getattr(splib, opname)

        with np.errstate(all='ignore'):
            result, index, fill = sparse_op(
                left_sp_values, left.sp_index, left.fill_value,
                right_sp_values, right.sp_index, right.fill_value)

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
def test_raises_empty_input(self):
    """``find_common_type`` must reject an empty list with ``ValueError``."""
    no_types = []
    with pytest.raises(ValueError):
        find_common_type(no_types)
def test_datetimetz_dtype_mismatch(dtype2):
    """A tz-aware dtype paired with ``dtype2`` resolves to object in either order."""
    tz_dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern")
    for pair in ([tz_dtype, dtype2], [dtype2, tz_dtype]):
        assert find_common_type(pair) == object
def test_categorical_dtype(self):
    """Categorical dtypes resolve to 'category'; mixing with object gives object."""
    dtype = CategoricalDtype()
    assert find_common_type([dtype]) == 'category'
    assert find_common_type([dtype, dtype]) == 'category'
    # Use the builtin ``object``: the ``np.object`` alias no longer exists
    # on NumPy >= 1.24.
    assert find_common_type([object, dtype]) == object
def test_categorical_dtype(self):
    """Categorical dtypes resolve to 'category'; mixing with object gives object."""
    dtype = CategoricalDtype()
    # Plain asserts keep this test independent of a unittest-style ``self``.
    assert find_common_type([dtype]) == 'category'
    assert find_common_type([dtype, dtype]) == 'category'
    # Use the builtin ``object``: the ``np.object`` alias no longer exists
    # on NumPy >= 1.24.
    assert find_common_type([object, dtype]) == object
def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None:
    """
    Concatenate the current output chunks along ``index_info.output_axis``
    and apply ``index_info.raw_index`` once more on each concatenated chunk.

    Does nothing when ``self.need_postprocess`` returns a falsy value.
    Otherwise ``context.out_chunks`` and ``context.out_nsplits`` are replaced
    with the newly built chunks and splits.

    Parameters
    ----------
    index_info : IndexInfo
        Supplies ``output_axis`` and ``raw_index`` for the axis being handled.
    context : IndexHandlerContext
        Supplies ``op``, ``out_chunks``, ``out_nsplits`` and
        ``concat_chunks``; updated in place.
    """
    if not self.need_postprocess(index_info, context):
        # do not need postprocess
        return

    chunks, nsplits = context.out_chunks, context.out_nsplits
    # lookup table from chunk index tuple to chunk
    index_to_chunks = {c.index: c for c in chunks}

    axis = index_info.output_axis
    new_out_chunks = []
    # size along ``axis`` of each produced chunk, keyed by its index on ``axis``
    chunk_axis_shapes = dict()
    # iterate over every chunk position on the axes other than ``axis``
    for chunk_index in itertools.product(*(range(len(ns)) for ax, ns in enumerate(nsplits) if ax != axis)):
        # gather all chunks along ``axis`` for this position
        # NOTE(review): the tuple building below only distinguishes axis 0
        # from "the other axis", so it presumably expects at most 2-d
        # chunks -- confirm.
        to_concat_chunks = []
        for i in range(len(nsplits[axis])):
            if axis == 0:
                to_concat_index = (i, ) + chunk_index
            else:
                to_concat_index = chunk_index + (i, )
            to_concat_chunks.append(index_to_chunks[to_concat_index])
        concat_chunk = context.concat_chunks(to_concat_chunks, axis)
        chunk_op = context.op.copy().reset_key()
        # select everything on the other axes, ``raw_index`` on ``axis``
        indexes = [slice(None)] * len(nsplits)
        indexes[axis] = index_info.raw_index
        params = concat_chunk.params
        if np.isscalar(index_info.raw_index):
            # a scalar index removes the indexed axis from the output
            assert axis == 0
            if "columns_value" in params:
                # the former columns become the index of the result, and the
                # per-column dtypes collapse into a single common dtype
                params["index_value"] = params.pop("columns_value")
                params["dtype"] = find_common_type(params["dtypes"].tolist())
                del params["dtypes"]
            if getattr(context.op.outputs[0], "name", None) is not None:
                params["name"] = context.op.outputs[0].name
            if len(params["index"]) == chunks[0].ndim:
                # drop the indexed axis from both chunk index and shape
                index = list(params["index"])
                index.pop(index_info.output_axis)
                params["index"] = tuple(index)
                shape = list(params["shape"])
                shape.pop(index_info.output_axis)
                params["shape"] = tuple(shape)
            if context.op.outputs[0].ndim == 0:
                # a 0-d output carries no index_value
                del params["index_value"]
        elif axis == 0:
            pd_index = pd.Index(index_info.raw_index)
            params["index_value"] = parse_index(pd_index, store_data=False)
            shape = list(params["shape"])
            shape[0] = len(pd_index)
            # NOTE(review): stored as a list here, while the other branches
            # store a tuple -- confirm whether downstream relies on either.
            params["shape"] = shape
        else:
            if context.op.can_index_miss:
                # reindex
                params["dtypes"] = dtypes = to_concat_chunks[0].dtypes
            else:
                params["dtypes"] = dtypes = concat_chunk.dtypes.loc[index_info.raw_index]
            params["columns_value"] = parse_index(dtypes.index, store_data=True)
            shape = list(params["shape"])
            shape[1] = len(dtypes)
            params["shape"] = tuple(shape)
        chunk_op._indexes = indexes
        chunk_op.stage = OperandStage.agg
        out_chunk = chunk_op.new_chunk([concat_chunk], kws=[params])
        if len(out_chunk.shape) != 0:
            # record the extent along ``axis`` for the new nsplits
            chunk_axis_shapes[out_chunk.index[axis]] = out_chunk.shape[axis]
        new_out_chunks.append(out_chunk)

    new_nsplits = list(nsplits)
    if np.isscalar(index_info.raw_index):
        # the indexed axis no longer exists in the output
        new_nsplits = new_nsplits[:axis] + new_nsplits[axis + 1:]
    else:
        # chunks were concatenated along ``axis``: a single split remains
        new_nsplits[axis] = (sum(chunk_axis_shapes.values()), )
    context.out_chunks = new_out_chunks
    context.out_nsplits = new_nsplits
def test_period_dtype_match():
    """Two identical daily period dtypes resolve to that same period dtype."""
    daily = PeriodDtype(freq="D")
    common = find_common_type([daily, daily])
    assert common == "period[D]"