def test_extension_array(self): # a = array([1, 3, np.nan, 2], dtype='Int64') a = array([1, 3, 2], dtype='Int64') result = safe_sort(a) # expected = array([1, 2, 3, np.nan], dtype='Int64') expected = array([1, 2, 3], dtype='Int64') tm.assert_extension_array_equal(result, expected)
def test_extension_array_labels(self, verify, na_sentinel):
    # safe_sort with codes: values come back sorted and the codes are
    # remapped onto the sorted order; the sentinel position is preserved.
    values = array([1, 3, 2], dtype='Int64')
    codes = [0, 1, na_sentinel, 2]
    result, labels = safe_sort(values, codes, na_sentinel=na_sentinel, verify=verify)
    tm.assert_extension_array_equal(result, array([1, 2, 3], dtype='Int64'))
    tm.assert_numpy_array_equal(labels, np.array([0, 2, na_sentinel, 1], dtype=np.intp))
def test_add_column_with_pandas_array(self):
    # GH 26390: adding a PandasArray column must match constructing the
    # frame with that column up front; both are stored as ObjectBlock.
    incremental = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['a', 'b', 'c', 'd']})
    incremental['c'] = pd.array([1, 2, None, 3])
    upfront = pd.DataFrame({'a': [1, 2, 3, 4],
                            'b': ['a', 'b', 'c', 'd'],
                            'c': pd.array([1, 2, None, 3])})
    assert type(incremental['c']._data.blocks[0]) == ObjectBlock
    assert type(upfront['c']._data.blocks[0]) == ObjectBlock
    assert_frame_equal(incremental, upfront)
def test_array_copy():
    source = np.array([1, 2])
    # default behaviour of pd.array is to copy the input buffer
    assert np.shares_memory(source, pd.array(source)._ndarray) is False
    # copy=True: explicit copy, no sharing
    assert np.shares_memory(source, pd.array(source, copy=True)._ndarray) is False
    # copy=False: the ndarray buffer is shared
    assert np.shares_memory(source, pd.array(source, copy=False)._ndarray) is True
def test_reductions_frame_dtypes():
    # Reductions on a mixed-dtype frame must agree between pandas and dask.
    df = pd.DataFrame({
        'int': [1, 2, 3, 4, 5, 6, 7, 8],
        'float': [1., 2., 3., 4., np.nan, 6., 7., 8.],
        'dt': [pd.NaT] + [datetime(2011, i, 1) for i in range(1, 8)],
        'str': list('abcdefgh'),
    })
    if HAS_INT_NA:
        df['intna'] = pd.array([1, 2, 3, 4, None, 6, 7, 8], dtype=pd.Int64Dtype())
    ddf = dd.from_pandas(df, 3)
    # default-argument reductions
    for op in ('sum', 'prod', 'min', 'max', 'count', 'std', 'var', 'sem'):
        assert_eq(getattr(df, op)(), getattr(ddf, op)())
    # population (ddof=0) variants
    for op in ('std', 'var', 'sem'):
        assert_eq(getattr(df, op)(ddof=0), getattr(ddf, op)(ddof=0))
    assert_eq(df.mean(), ddf.mean())
    assert_eq(df._get_numeric_data(), ddf._get_numeric_data())
    # selecting only numeric columns should not add graph tasks
    numerics = ddf[['int', 'float']]
    assert numerics._get_numeric_data().dask == numerics.dask
def test_from_pandas_array(self):
    # Integer nanosecond values fed through _from_sequence with freq='infer'
    # should round-trip to an hourly date_range.
    nanos = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9
    result = DatetimeArray._from_sequence(nanos, freq='infer')
    expected = pd.date_range('1970-01-01', periods=5, freq='H')._data
    tm.assert_datetime_array_equal(result, expected)
def test_array_not_registered(registry_without_decimal):
    # With 'decimal' absent from the registry, an explicit dtype still works.
    assert registry.find('decimal') is None
    values = [decimal.Decimal('1'), decimal.Decimal('2')]
    result = pd.array(values, dtype=DecimalDtype)
    tm.assert_equal(result, DecimalArray._from_sequence(values))
def test_array_unboxes(box):
    # pd.array must unbox Series/Index containers before handing the values
    # to the extension type; _from_sequence on the raw box fails.
    boxed = box([decimal.Decimal('1'), decimal.Decimal('2')])
    with pytest.raises(TypeError):
        DecimalArray2._from_sequence(boxed)
    result = pd.array(boxed, dtype='decimal2')
    tm.assert_equal(result, DecimalArray2._from_sequence(boxed.values))
def test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype): arr = pd.array([1, 3, 90], dtype=any_real_dtype) result = arr.searchsorted(30) assert is_scalar(result) assert result == 2 result = arr.searchsorted([30]) expected = np.array([2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected)
def _nonempty_series(s, idx=None):
    """Return a two-element, non-empty Series mimicking the dtype of ``s``.

    Parameters
    ----------
    s : pd.Series
        Template series whose dtype/name are reproduced.
    idx : index-like, optional
        Index for the result; defaults to a non-empty stand-in built from
        ``s.index`` via ``_nonempty_index``.

    Returns
    -------
    pd.Series
        Two placeholder entries with the same dtype and name as ``s``.
    """
    # TODO: Use register dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        # tz-aware datetimes: reuse the template's timezone
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            # reuse the first real category twice
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            # no categories: synthesize them from the categories index
            data = _nonempty_index(s.cat.categories)
            cats = None
        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        # nullable integer: include a missing value to exercise the mask
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period('2000', freq), pd.Period('2001', freq)]
    elif is_sparse(dtype):
        # TODO: pandas <0.24
        # Pandas <= 0.23.4:
        # NOTE(review): both branches are currently identical — the version
        # split looks vestigial; confirm before removing.
        if PANDAS_GT_0240:
            entry = _scalar_from_dtype(dtype.subtype)
        else:
            entry = _scalar_from_dtype(dtype.subtype)
        data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_0240:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = np.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        # extension dtypes registered with the dask dispatcher
        data = make_array_nonempty(dtype)
    else:
        # plain numpy dtype fallback
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)
    return pd.Series(data, name=s.name, index=idx)
def test_searchsorted(self, string_dtype): arr = pd.array(['a', 'b', 'c'], dtype=string_dtype) result = arr.searchsorted('a', side='left') assert is_scalar(result) assert result == 0 result = arr.searchsorted('a', side='right') assert is_scalar(result) assert result == 1
def test_make_block_no_pandas_array():
    # https://github.com/pandas-dev/pandas/pull/24866
    # PandasArray / PandasDtype inputs must be stored as plain integer
    # blocks, never as extension blocks.
    arr = pd.array([1, 2])
    blocks = (
        make_block(arr, slice(len(arr))),                             # PandasArray, no dtype
        make_block(arr, slice(len(arr)), dtype=arr.dtype),            # PandasArray, PandasDtype
        make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype), # ndarray, PandasDtype
    )
    for blk in blocks:
        assert blk.is_integer is True
        assert blk.is_extension is False
def test_dataframe_reductions(op): # https://github.com/pandas-dev/pandas/pull/32867 # ensure the integers are not cast to float during reductions df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) result = df.max() assert isinstance(result["a"], np.int64)
def test_pandas_array_dtype(self, data):
    # ... but specifying dtype will override idempotency:
    # an explicit object dtype forces a PandasArray result.
    expected = pd.arrays.PandasArray(np.asarray(data, dtype=object))
    result = pd.array(data, dtype=np.dtype(object))
    self.assert_equal(result, expected)
class TestDataFrameCov:
    """Tests for DataFrame.cov (min_periods, NA handling, nullable ints)."""

    def test_cov(self, float_frame, float_string_frame):
        # min_periods no NAs (corner case)
        expected = float_frame.cov()
        result = float_frame.cov(min_periods=len(float_frame))
        tm.assert_frame_equal(expected, result)
        # min_periods above the row count -> everything NA
        result = float_frame.cov(min_periods=len(float_frame) + 1)
        assert isna(result.values).all()
        # with NAs
        frame = float_frame.copy()
        frame["A"][:5] = np.nan
        frame["B"][5:10] = np.nan
        result = frame.cov(min_periods=len(frame) - 8)
        expected = frame.cov()
        expected.loc["A", "B"] = np.nan
        expected.loc["B", "A"] = np.nan
        tm.assert_frame_equal(result, expected)
        # regular
        result = frame.cov()
        expected = frame["A"].cov(frame["C"])
        tm.assert_almost_equal(result["A"]["C"], expected)
        # exclude non-numeric types
        result = float_string_frame.cov()
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
        tm.assert_frame_equal(result, expected)
        # Single column frame
        df = DataFrame(np.linspace(0.0, 1.0, 10))
        result = df.cov()
        expected = DataFrame(np.cov(df.values.T).reshape((1, 1)),
                             index=df.columns, columns=df.columns)
        tm.assert_frame_equal(result, expected)
        # single column with an NA: covariance over the remaining rows
        df.loc[0] = np.nan
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values[1:].T).reshape((1, 1)),
            index=df.columns,
            columns=df.columns,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
    def test_cov_ddof(self, test_ddof):
        # GH#34611: the ddof argument must be forwarded to np.cov semantics
        np_array1 = np.random.rand(10)
        np_array2 = np.random.rand(10)
        df = DataFrame({0: np_array1, 1: np_array2})
        result = df.cov(ddof=test_ddof)
        expected_np = np.cov(np_array1, np_array2, ddof=test_ddof)
        expected = DataFrame(expected_np)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])])
    def test_cov_nullable_integer(self, other_column):
        # https://github.com/pandas-dev/pandas/issues/33803
        # nullable Int64 column mixed with another numeric column
        data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
        result = data.cov()
        arr = np.array([[0.5, 0.5], [0.5, 1.0]])
        expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(result, expected)
if wins[i]== max_wins: leader.append(i) if(sum(remain)==0): for i in range(teams): eliminated.append(i) for i in range(len(leader)): eliminated.remove(leader[i]) for i in eliminated: #print("Teams which got eliminated are -") print(i, end=" ") else: i = 0 n = 5 layer1_values = [] for i in range(teams - 1): y = pd.array(data.iloc[i]) layer1_values.append(y[n:]) n = n + 1 Layer1_capacities = list(chain.from_iterable(layer1_values)) layer_id = 'L1' layer_1_ids = [] for i in range(0, len(Layer1_capacities)): node_id = layer_id + str(i) layer_1_ids.append(node_id) layer_id = 'L2' layer_2_ids = [] for i in range(0, teams): node_id = layer_id + str(i) layer_2_ids.append(node_id)
def test_searchsorted_sorter(self, any_real_dtype): arr = pd.array([3, 1, 2], dtype=any_real_dtype) result = arr.searchsorted([0, 3], sorter=np.argsort(arr)) expected = np.array([0, 2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected)
def time_from_float_array(self):
    # benchmark: BooleanArray construction from a float source
    pd.array(self.values_float, dtype="boolean")
def test_min_max(method, skipna):
    # min/max on a string Series: NA-aware with skipna, NA otherwise
    ser = pd.Series(["a", "b", "c", None], dtype="string")
    result = getattr(ser, method)(skipna=skipna)
    if not skipna:
        assert result is pd.NA
    else:
        assert result == ("a" if method == "min" else "c")


@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize(
    "arr",
    [
        pd.Series(["a", "b", "c", None], dtype="string"),
        pd.array(["a", "b", "c", None], dtype="string"),
    ],
)
def test_min_max_numpy(method, arr):
    # np.min/np.max dispatch to the NA-skipping pandas reductions
    assert getattr(np, method)(arr) == ("a" if method == "min" else "c")


@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.xfail(reason="Not implemented StringArray.sum")
def test_reduce_missing(skipna):
    # sum over a string Series with missing values
    ser = pd.Series([None, "a", None, "b", "c", None], dtype="string")
    result = ser.sum(skipna=skipna)
    if skipna:
        assert result == "abc"
def data_for_grouping(dtype):
    # Fixture layout required by the base grouping tests: B B NA NA A A B C
    a, b, c = 0.0, 0.1, 0.2
    na = pd.NA
    return pd.array([b, b, na, na, a, a, b, c], dtype=dtype)
def test_iloc_nullable_int64_size_1_nan(self):
    # GH 31861: casting a length-1 NaN column to Int64 through .loc
    # must produce a proper <NA>, not a float NaN
    frame = DataFrame({"a": ["test"], "b": [np.nan]})
    frame.loc[:, "b"] = frame.loc[:, "b"].astype("Int64")
    tm.assert_frame_equal(
        frame, DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")})
    )
def data_for_sorting(dtype):
    """Fixture: three distinct values in the order mid, high, low."""
    values = [0.1, 0.2, 0.0]
    return pd.array(values, dtype=dtype)
def data_missing_for_sorting(dtype):
    """Fixture: high, missing, low — exercises NA placement when sorting."""
    return pd.array([0.1, pd.NA, 0.0], dtype=dtype)
def data_missing(dtype):
    """Fixture: [missing, valid] pair in the requested dtype."""
    return pd.array([pd.NA, 0.1], dtype=dtype)
def data_for_twos(dtype):
    """Fixture: one hundred 2s, used by arithmetic/divmod base tests."""
    twos = np.ones(100) * 2
    return pd.array(twos, dtype=dtype)
def data(dtype):
    """Fixture: generated sample values wrapped in the requested dtype."""
    return pd.array(make_data(), dtype=dtype)
def test_array_inference_fails(data):
    # inputs that defeat dtype inference fall back to an object PandasArray
    expected = PandasArray(np.array(data, dtype=object))
    tm.assert_extension_array_equal(pd.array(data), expected)
def test_replace_extension_other(self): # https://github.com/pandas-dev/pandas/issues/34530 ser = pd.Series(pd.array([1, 2, 3], dtype="Int64")) ser.replace("", "") # no exception
def test_scalar_raises():
    # pd.array requires a sequence; a bare scalar is rejected
    msg = "Cannot pass scalar '1'"
    with pytest.raises(ValueError, match=msg):
        pd.array(1)
def time_from_integer_array(self):
    # benchmark: nullable Int64 construction from integer input
    pd.array(self.values_integer, dtype="Int64")
def test_astype_int():
    # numeric strings cast cleanly to Int64, NA passes through
    source = pd.array(["1", pd.NA, "3"], dtype="string")
    converted = source.astype("Int64")
    tm.assert_extension_array_equal(
        converted, pd.array([1, pd.NA, 3], dtype="Int64")
    )
def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): arr = pd.array([1, 3, 90], dtype=any_real_dtype) result = arr.searchsorted([2, 30]) expected = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected)
usecols=[0, 1, 6], parse_dates=[1]) print(df.head()) # In[3]: start = pd.to_datetime('2019-04-01', format='%Y-%m-%d') end = pd.to_datetime('2019-07-01', format='%Y-%m-%d') print(f'start: {start}') print(f'end: {end}') # In[4]: s_full = pd.array(df.iloc[:, 0]) t_full = pd.array(pd.DatetimeIndex(df.iloc[:, 1]).astype( np.int64)) / 1000000000 t_full = np.extract([s_full == 2], t_full) dt = t_full[1] - t_full[0] print(f'data sampling is {dt:.2f} secs') # In[5]: t_start = pd.DatetimeIndex([start]).astype(np.int64) / 1000000000 t_end = pd.DatetimeIndex([end]).astype(np.int64) / 1000000000 t = np.extract([(t_full >= t_start[0]) & (t_full <= t_end[0])], t_full)
def test_take_all_empty(self):
    # taking every position of an all-sparse array with fill returns an
    # equivalent array
    sparse = pd.array([0, 0], dtype=pd.SparseDtype("int64"))
    taken = sparse.take([0, 1], allow_fill=True, fill_value=np.nan)
    tm.assert_sp_array_equal(sparse, taken)
def test_from_array(self):
    # td64/dt64 arrays are stored internally as plain (non-extension) blocks
    for values, dtype in ((['1H', '2H'], 'timedelta64[ns]'),
                          (['2015'], 'datetime64[ns]')):
        ser = pd.Series(pd.array(values, dtype=dtype))
        assert ser._data.blocks[0].is_extension is False
def test_pandas_array(self, data):
    # pd.array(extension_array) should be idempotent...
    self.assert_extension_array_equal(pd.array(data), data)
def test_is_string_dtype_nullable(nullable_string_dtype):
    # nullable string arrays must be recognised as string dtype
    arr = pd.array(["a", "b"], dtype=nullable_string_dtype)
    assert com.is_string_dtype(arr)
def test_search_sorted_datetime64_scalar(self, arr, val):
    # a scalar key yields a scalar position
    arr = pd.array(arr)
    position = arr.searchsorted(val)
    assert is_scalar(position)
    assert position == 1
class BaseGetitemTests(BaseExtensionTests):
    """Tests for ExtensionArray.__getitem__."""

    # NOTE(review): `data` is the standard extension-test fixture; the tests
    # below assume len(data) >= 4 with no NAs in the first positions —
    # confirm against the accompanying conftest.

    def test_iloc_series(self, data):
        # positional slice and positional list must agree
        ser = pd.Series(data)
        result = ser.iloc[:4]
        expected = pd.Series(data[:4])
        self.assert_series_equal(result, expected)
        result = ser.iloc[[0, 1, 2, 3]]
        self.assert_series_equal(result, expected)

    def test_iloc_frame(self, data):
        df = pd.DataFrame({
            "A": data,
            "B": np.arange(len(data), dtype="int64")
        })
        expected = pd.DataFrame({"A": data[:4]})
        # slice -> frame
        result = df.iloc[:4, [0]]
        self.assert_frame_equal(result, expected)
        # sequence -> frame
        result = df.iloc[[0, 1, 2, 3], [0]]
        self.assert_frame_equal(result, expected)
        expected = pd.Series(data[:4], name="A")
        # slice -> series
        result = df.iloc[:4, 0]
        self.assert_series_equal(result, expected)
        # sequence -> series
        result = df.iloc[:4, 0]
        self.assert_series_equal(result, expected)
        # GH#32959 slice columns with step
        result = df.iloc[:, ::2]
        self.assert_frame_equal(result, df[["A"]])
        result = df[["B", "A"]].iloc[:, ::2]
        self.assert_frame_equal(result, df[["B"]])

    def test_iloc_frame_single_block(self, data):
        # GH#32959 null slice along index, slice along columns with single-block
        df = pd.DataFrame({"A": data})
        result = df.iloc[:, :]
        self.assert_frame_equal(result, df)
        result = df.iloc[:, :1]
        self.assert_frame_equal(result, df)
        result = df.iloc[:, :2]
        self.assert_frame_equal(result, df)
        result = df.iloc[:, ::2]
        self.assert_frame_equal(result, df)
        result = df.iloc[:, 1:2]
        self.assert_frame_equal(result, df.iloc[:, :0])
        result = df.iloc[:, -1:]
        self.assert_frame_equal(result, df)

    def test_loc_series(self, data):
        ser = pd.Series(data)
        result = ser.loc[:3]
        expected = pd.Series(data[:4])
        self.assert_series_equal(result, expected)
        result = ser.loc[[0, 1, 2, 3]]
        self.assert_series_equal(result, expected)

    def test_loc_frame(self, data):
        df = pd.DataFrame({
            "A": data,
            "B": np.arange(len(data), dtype="int64")
        })
        expected = pd.DataFrame({"A": data[:4]})
        # slice -> frame
        result = df.loc[:3, ["A"]]
        self.assert_frame_equal(result, expected)
        # sequence -> frame
        result = df.loc[[0, 1, 2, 3], ["A"]]
        self.assert_frame_equal(result, expected)
        expected = pd.Series(data[:4], name="A")
        # slice -> series
        result = df.loc[:3, "A"]
        self.assert_series_equal(result, expected)
        # sequence -> series
        result = df.loc[:3, "A"]
        self.assert_series_equal(result, expected)

    def test_loc_iloc_frame_single_dtype(self, data):
        # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
        # return a scalar
        df = pd.DataFrame({"A": data})
        expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)
        result = df.loc[2]
        self.assert_series_equal(result, expected)
        expected = pd.Series([data[-1]], index=["A"], name=len(data) - 1,
                             dtype=data.dtype)
        result = df.iloc[-1]
        self.assert_series_equal(result, expected)

    def test_getitem_scalar(self, data):
        # scalar indexing returns the dtype's scalar type
        result = data[0]
        assert isinstance(result, data.dtype.type)
        result = pd.Series(data)[0]
        assert isinstance(result, data.dtype.type)

    def test_getitem_invalid(self, data):
        # TODO: box over scalar, [scalar], (scalar,)?
        msg = (
            r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
            r"\(`None`\) and integer or boolean arrays are valid indices")
        with pytest.raises(IndexError, match=msg):
            data["foo"]
        with pytest.raises(IndexError, match=msg):
            data[2.5]
        ub = len(data)
        # each backing implementation words its out-of-bounds error differently
        msg = "|".join([
            "list index out of range",  # json
            "index out of bounds",  # pyarrow
            "Out of bounds access",  # Sparse
            f"loc must be an integer between -{ub} and {ub}",  # Sparse
            f"index {ub+1} is out of bounds for axis 0 with size {ub}",
            f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
        ])
        with pytest.raises(IndexError, match=msg):
            data[ub + 1]
        with pytest.raises(IndexError, match=msg):
            data[-ub - 1]

    def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
        result = data_missing[0]
        assert na_cmp(result, na_value)

    def test_getitem_empty(self, data):
        # Indexing with empty list
        result = data[[]]
        assert len(result) == 0
        assert isinstance(result, type(data))
        expected = data[np.array([], dtype="int64")]
        self.assert_extension_array_equal(result, expected)

    def test_getitem_mask(self, data):
        # Empty mask, raw array
        mask = np.zeros(len(data), dtype=bool)
        result = data[mask]
        assert len(result) == 0
        assert isinstance(result, type(data))
        # Empty mask, in series
        mask = np.zeros(len(data), dtype=bool)
        result = pd.Series(data)[mask]
        assert len(result) == 0
        assert result.dtype == data.dtype
        # non-empty mask, raw array
        mask[0] = True
        result = data[mask]
        assert len(result) == 1
        assert isinstance(result, type(data))
        # non-empty mask, in series
        result = pd.Series(data)[mask]
        assert len(result) == 1
        assert result.dtype == data.dtype

    def test_getitem_mask_raises(self, data):
        mask = np.array([True, False])
        msg = f"Boolean index has wrong length: 2 instead of {len(data)}"
        with pytest.raises(IndexError, match=msg):
            data[mask]
        mask = pd.array(mask, dtype="boolean")
        with pytest.raises(IndexError, match=msg):
            data[mask]

    def test_getitem_boolean_array_mask(self, data):
        # nullable BooleanArray mask behaves like a plain bool mask
        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
        result = data[mask]
        assert len(result) == 0
        assert isinstance(result, type(data))
        result = pd.Series(data)[mask]
        assert len(result) == 0
        assert result.dtype == data.dtype
        mask[:5] = True
        expected = data.take([0, 1, 2, 3, 4])
        result = data[mask]
        self.assert_extension_array_equal(result, expected)
        expected = pd.Series(expected)
        result = pd.Series(data)[mask]
        self.assert_series_equal(result, expected)

    def test_getitem_boolean_na_treated_as_false(self, data):
        # https://github.com/pandas-dev/pandas/issues/31503
        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
        mask[:2] = pd.NA
        mask[2:4] = True
        result = data[mask]
        expected = data[mask.fillna(False)]
        self.assert_extension_array_equal(result, expected)
        s = pd.Series(data)
        result = s[mask]
        expected = s[mask.fillna(False)]
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2],
         pd.array([0, 1, 2], dtype="Int64"),
         np.array([0, 1, 2])],
        ids=["list", "integer-array", "numpy-array"],
    )
    def test_getitem_integer_array(self, data, idx):
        result = data[idx]
        assert len(result) == 3
        assert isinstance(result, type(data))
        expected = data.take([0, 1, 2])
        self.assert_extension_array_equal(result, expected)
        expected = pd.Series(expected)
        result = pd.Series(data)[idx]
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
        ids=["list", "integer-array"],
    )
    def test_getitem_integer_with_missing_raises(self, data, idx):
        msg = "Cannot index with an integer indexer containing NA values"
        with pytest.raises(ValueError, match=msg):
            data[idx]

    @pytest.mark.xfail(reason="Tries label-based and raises KeyError; "
                       "in some cases raises when calling np.asarray")
    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
        ids=["list", "integer-array"],
    )
    def test_getitem_series_integer_with_missing_raises(self, data, idx):
        msg = "Cannot index with an integer indexer containing NA values"
        # TODO: this raises KeyError about labels not found (it tries label-based)
        ser = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])
        with pytest.raises(ValueError, match=msg):
            ser[idx]

    def test_getitem_slice(self, data):
        # getitem[slice] should return an array
        result = data[slice(0)]  # empty
        assert isinstance(result, type(data))
        result = data[slice(1)]  # scalar
        assert isinstance(result, type(data))

    def test_getitem_ellipsis_and_slice(self, data):
        # GH#40353 this is called from getitem_block_index
        result = data[..., :]
        self.assert_extension_array_equal(result, data)
        result = data[:, ...]
        self.assert_extension_array_equal(result, data)
        result = data[..., :3]
        self.assert_extension_array_equal(result, data[:3])
        result = data[:3, ...]
        self.assert_extension_array_equal(result, data[:3])
        result = data[..., ::2]
        self.assert_extension_array_equal(result, data[::2])
        result = data[::2, ...]
        self.assert_extension_array_equal(result, data[::2])

    def test_get(self, data):
        # GH 20882
        s = pd.Series(data, index=[2 * i for i in range(len(data))])
        assert s.get(4) == s.iloc[2]
        result = s.get([4, 6])
        expected = s.iloc[[2, 3]]
        self.assert_series_equal(result, expected)
        with tm.assert_produces_warning(FutureWarning, match="label-based"):
            result = s.get(slice(2))
        expected = s.iloc[[0, 1]]
        self.assert_series_equal(result, expected)
        assert s.get(-1) is None
        assert s.get(s.index.max() + 1) is None
        s = pd.Series(data[:6], index=list("abcdef"))
        assert s.get("c") == s.iloc[2]
        result = s.get(slice("b", "d"))
        expected = s.iloc[[1, 2, 3]]
        self.assert_series_equal(result, expected)
        result = s.get("Z")
        assert result is None
        assert s.get(4) == s.iloc[4]
        assert s.get(-1) == s.iloc[-1]
        assert s.get(len(s)) is None
        # GH 21257
        s = pd.Series(data)
        with tm.assert_produces_warning(None):
            # GH#45324 make sure we aren't giving a spurious FutureWarning
            s2 = s[::2]
        assert s2.get(1) is None

    def test_take_sequence(self, data):
        result = pd.Series(data)[[0, 1, 3]]
        assert result.iloc[0] == data[0]
        assert result.iloc[1] == data[1]
        assert result.iloc[2] == data[3]

    def test_take(self, data, na_value, na_cmp):
        result = data.take([0, -1])
        assert result.dtype == data.dtype
        assert result[0] == data[0]
        assert result[1] == data[-1]
        result = data.take([0, -1], allow_fill=True, fill_value=na_value)
        assert result[0] == data[0]
        assert na_cmp(result[1], na_value)
        with pytest.raises(IndexError, match="out of bounds"):
            data.take([len(data) + 1])

    def test_take_empty(self, data, na_value, na_cmp):
        empty = data[:0]
        result = empty.take([-1], allow_fill=True)
        assert na_cmp(result[0], na_value)
        msg = "cannot do a non-empty take from an empty axes|out of bounds"
        with pytest.raises(IndexError, match=msg):
            empty.take([-1])
        with pytest.raises(IndexError, match="cannot do a non-empty take"):
            empty.take([0, 1])

    def test_take_negative(self, data):
        # https://github.com/pandas-dev/pandas/issues/20640
        n = len(data)
        result = data.take([0, -n, n - 1, -1])
        expected = data.take([0, 0, n - 1, n - 1])
        self.assert_extension_array_equal(result, expected)

    def test_take_non_na_fill_value(self, data_missing):
        fill_value = data_missing[1]  # valid
        na = data_missing[0]
        arr = data_missing._from_sequence([na, fill_value, na],
                                          dtype=data_missing.dtype)
        result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True)
        expected = arr.take([1, 1])
        self.assert_extension_array_equal(result, expected)

    def test_take_pandas_style_negative_raises(self, data, na_value):
        with pytest.raises(ValueError, match=""):
            data.take([0, -2], fill_value=na_value, allow_fill=True)

    @pytest.mark.parametrize("allow_fill", [True, False])
    def test_take_out_of_bounds_raises(self, data, allow_fill):
        arr = data[:3]
        with pytest.raises(IndexError, match="out of bounds|out-of-bounds"):
            arr.take(np.asarray([0, 3]), allow_fill=allow_fill)

    def test_take_series(self, data):
        s = pd.Series(data)
        result = s.take([0, -1])
        expected = pd.Series(
            data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
            index=[0, len(data) - 1],
        )
        self.assert_series_equal(result, expected)

    def test_reindex(self, data, na_value):
        s = pd.Series(data)
        result = s.reindex([0, 1, 3])
        expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
        self.assert_series_equal(result, expected)
        n = len(data)
        result = s.reindex([-1, 0, n])
        expected = pd.Series(
            data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
            index=[-1, 0, n],
        )
        self.assert_series_equal(result, expected)
        result = s.reindex([n, n + 1])
        expected = pd.Series(data._from_sequence([na_value, na_value],
                                                 dtype=s.dtype),
                             index=[n, n + 1])
        self.assert_series_equal(result, expected)

    def test_reindex_non_na_fill_value(self, data_missing):
        valid = data_missing[1]
        na = data_missing[0]
        arr = data_missing._from_sequence([na, valid], dtype=data_missing.dtype)
        ser = pd.Series(arr)
        result = ser.reindex([0, 1, 2], fill_value=valid)
        expected = pd.Series(
            data_missing._from_sequence([na, valid, valid],
                                        dtype=data_missing.dtype))
        self.assert_series_equal(result, expected)

    def test_loc_len1(self, data):
        # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim
        df = pd.DataFrame({"A": data})
        res = df.loc[[0], "A"]
        assert res.ndim == 1
        assert res._mgr.arrays[0].ndim == 1
        if hasattr(res._mgr, "blocks"):
            assert res._mgr._block.ndim == 1

    def test_item(self, data):
        # https://github.com/pandas-dev/pandas/pull/30175
        s = pd.Series(data)
        result = s[:1].item()
        assert result == data[0]
        msg = "can only convert an array of size 1 to a Python scalar"
        with pytest.raises(ValueError, match=msg):
            s[:0].item()
        with pytest.raises(ValueError, match=msg):
            s.item()

    def test_ellipsis_index(self):
        # GH42430 1D slices over extension types turn into N-dimensional slices over
        # ExtensionArrays
        class CapturingStringArray(pd.arrays.StringArray):
            """Extend StringArray to capture arguments to __getitem__"""

            def __getitem__(self, item):
                self.last_item_arg = item
                return super().__getitem__(item)

        df = pd.DataFrame({
            "col1": CapturingStringArray(np.array(["hello", "world"],
                                                  dtype=object))
        })
        _ = df.iloc[:1]
        # String comparison because there's no native way to compare slices.
        # Before the fix for GH42430, last_item_arg would get set to the 2D slice
        # (Ellipsis, slice(None, 1, None))
        self.assert_equal(str(df["col1"].array.last_item_arg),
                          "slice(None, 1, None)")
def time_from_bool_array(self):
    # benchmark: BooleanArray construction from plain bools
    pd.array(self.values_bool, dtype="boolean")
class TestDataFrameCorr:
    """Tests for DataFrame.corr (methods, NA handling, item-cache safety)."""
    # DataFrame.corr(), as opposed to DataFrame.corrwith

    @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
    @td.skip_if_no_scipy
    def test_corr_scipy_method(self, float_frame, method):
        float_frame["A"][:5] = np.nan
        float_frame["B"][5:10] = np.nan
        float_frame["A"][:10] = float_frame["A"][10:20]
        correls = float_frame.corr(method=method)
        expected = float_frame["A"].corr(float_frame["C"], method=method)
        tm.assert_almost_equal(correls["A"]["C"], expected)

    # ---------------------------------------------------------------------

    def test_corr_non_numeric(self, float_string_frame):
        # exclude non-numeric types
        result = float_string_frame.corr()
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr()
        tm.assert_frame_equal(result, expected)

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_nooverlap(self, meth):
        # nothing in common
        df = DataFrame({
            "A": [1, 1.5, 1, np.nan, np.nan, np.nan],
            "B": [np.nan, np.nan, np.nan, 1, 1.5, 1],
            "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        })
        rs = df.corr(meth)
        assert isna(rs.loc["A", "B"])
        assert isna(rs.loc["B", "A"])
        assert rs.loc["A", "A"] == 1
        assert rs.loc["B", "B"] == 1
        assert isna(rs.loc["C", "C"])

    @pytest.mark.parametrize("meth", ["pearson", "spearman"])
    def test_corr_constant(self, meth):
        # constant --> all NA
        df = DataFrame({
            "A": [1, 1, 1, np.nan, np.nan, np.nan],
            "B": [np.nan, np.nan, np.nan, 1, 1, 1],
        })
        rs = df.corr(meth)
        assert isna(rs.values).all()

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_int_and_boolean(self, meth):
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        df = DataFrame({"a": [True, False], "b": [1, 0]})
        expected = DataFrame(np.ones((2, 2)), index=["a", "b"],
                             columns=["a", "b"])
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("ignore", RuntimeWarning)
            result = df.corr(meth)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("method", ["cov", "corr"])
    def test_corr_cov_independent_index_column(self, method):
        # GH#14617: result index/columns must be distinct objects
        df = DataFrame(np.random.randn(4 * 10).reshape(10, 4),
                       columns=list("abcd"))
        result = getattr(df, method)()
        assert result.index is not result.columns
        assert result.index.equals(result.columns)

    def test_corr_invalid_method(self):
        # GH#22298
        df = DataFrame(np.random.normal(size=(10, 2)))
        msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
        with pytest.raises(ValueError, match=msg):
            df.corr(method="____")

    def test_corr_int(self):
        # dtypes other than float64 GH#1761
        df = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})
        df.cov()
        df.corr()

    @td.skip_if_no_scipy
    @pytest.mark.parametrize(
        "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])])
    @pytest.mark.parametrize(
        "other_column",
        [
            pd.array([1, 2, 3]),
            np.array([1.0, 2.0, 3.0]),
            np.array([1.0, 2.0, np.nan])
        ],
    )
    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_nullable_integer(self, nullable_column, other_column, method):
        # https://github.com/pandas-dev/pandas/issues/33803
        data = DataFrame({"a": nullable_column, "b": other_column})
        result = data.corr(method=method)
        expected = DataFrame(np.ones((2, 2)), columns=["a", "b"],
                             index=["a", "b"])
        tm.assert_frame_equal(result, expected)

    def test_corr_item_cache(self):
        # Check that corr does not lead to incorrect entries in item_cache
        df = DataFrame({"A": range(10)})
        df["B"] = range(10)[::-1]
        ser = df["A"]  # populate item_cache
        assert len(df._mgr.arrays) == 2  # i.e. 2 blocks
        _ = df.corr()
        # Check that the corr didn't break link between ser and df
        ser.values[0] = 99
        assert df.loc[0, "A"] == 99
        assert df["A"] is ser
        assert df.values[0, 0] == 99

    @pytest.mark.parametrize("length", [2, 20, 200, 2000])
    def test_corr_for_constant_columns(self, length):
        # GH: 37448
        df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
        result = df.corr()
        expected = DataFrame({
            "A": [np.nan, np.nan],
            "B": [np.nan, np.nan]
        }, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    def test_calc_corr_small_numbers(self):
        # GH: 37452
        df = DataFrame({
            "A": [1.0e-20, 2.0e-20, 3.0e-20],
            "B": [1.0e-20, 2.0e-20, 3.0e-20]
        })
        result = df.corr()
        expected = DataFrame({
            "A": [1.0, 1.0],
            "B": [1.0, 1.0]
        }, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_min_periods_greater_than_length(self, method):
        # min_periods larger than the frame -> all-NA result
        df = DataFrame({"A": [1, 2], "B": [1, 2]})
        result = df.corr(method=method, min_periods=3)
        expected = DataFrame({
            "A": [np.nan, np.nan],
            "B": [np.nan, np.nan]
        }, index=["A", "B"])
        tm.assert_frame_equal(result, expected)
def test_integer_array_numpy_sum(values, expected): arr = pd.array(values, dtype="Int64") result = np.sum(arr) assert result == expected
def test_array(data, dtype, expected): result = pd.array(data, dtype=dtype) tm.assert_equal(result, expected)
def test_to_numpy_na_raises(self, dtype):
    # converting an NA-bearing array to a non-nullable numpy dtype must raise
    nullable = pd.array([0, 1, None], dtype="Int64")
    with pytest.raises(ValueError, match=dtype):
        nullable.to_numpy(dtype=dtype)
def test_array_inference(data, expected): result = pd.array(data) tm.assert_equal(result, expected)
def test_astype_str(self):
    # both str and "str" targets stringify values, rendering NA as "<NA>"
    nullable = pd.array([1, 2, None], dtype="Int64")
    expected = np.array(["1", "2", "<NA>"], dtype=object)
    for target in (str, "str"):
        tm.assert_numpy_array_equal(nullable.astype(target), expected)
def test_nd_raises(data):
    # multi-dimensional input is rejected by pd.array
    msg = 'PandasArray must be 1-dimensional'
    with pytest.raises(ValueError, match=msg):
        pd.array(data)
from sklearn.tree import DecisionTreeClassifier from sklearn.feature_selection import RFE, RFECV from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score features =df[features_list2[1:]] features = features.fillna(features.mean()) for name in df.name: if name in my_dataset.keys(): for columns in features.columns: my_dataset[name][columns] = float(df[columns][df.name == name]) labels =df[features_list2[0]] labels = np.array(labels) features = np.array(features) ### RFECV method ### RFECV method and try 4 different classifier method: logistic regression, Decision Tree, Random Forrest and Adaptive boosting ### logistic regression clf_Log = LogisticRegression(random_state = 14, C= 5, class_weight='balanced') selectorCV_Log = RFECV(clf_Log, step=1, cv=5, scoring = 'f1') selectorCV_Log.fit(features, labels) refcv_figure(selectorCV_Log) clf = selectorCV_Log.estimator_ features_new = selectorCV_Log.transform(features)
"a": 1 }, "dict"), (gen_adata((3, 2)), "anndata"), (sparse.random(5, 3, format="csr", density=0.5), "csr_matrix"), (sparse.random(5, 3, format="csc", density=0.5), "csc_matrix"), (pd.DataFrame({"a": [1, 2, 3]}), "dataframe"), (pd.Categorical(list("aabccedd")), "categorical"), (pd.Categorical(list("aabccedd"), ordered=True), "categorical"), (pd.Categorical([1, 2, 1, 3], ordered=True), "categorical"), ( pd.arrays.IntegerArray(np.ones(5, dtype=int), mask=np.array( [True, False, True, False, True])), "nullable-integer", ), (pd.array([1, 2, 3]), "nullable-integer"), ( pd.arrays.BooleanArray( np.random.randint(0, 2, size=5, dtype=bool), mask=np.random.randint(0, 2, size=5, dtype=bool), ), "nullable-boolean", ), (pd.array([True, False, True, True]), "nullable-boolean"), # (bytes, b"some bytes", "bytes"), # Does not work for zarr # TODO consider how specific encodings should be. Should we be fully describing the written type? # Currently the info we add is: "what you wouldn't be able to figure out yourself" # but that's not really a solid rule. # (bool, True, "bool"), # (bool, np.bool_(False), "bool"), ],
def time_from_integer_like(self):
    # benchmark: BooleanArray construction from integer-like input
    pd.array(self.values_integer_like, dtype="boolean")
def test_uses_pandas_na(self): a = pd.array([1, None], dtype=pd.Int64Dtype()) assert a[1] is pd.NA
def __init__(self, value): self.value = value def __add__(self, other): return self.value + other.value arr = np.array([Dummy(0), Dummy(1)]) ser = pd.Series(arr) tm.assert_series_equal(np.add(ser, ser), pd.Series(np.add(ser, arr))) tm.assert_series_equal(np.add(ser, Dummy(1)), pd.Series(np.add(ser, Dummy(1)))) @pytest.fixture( params=[ pd.array([1, 3, 2], dtype=np.int64), pd.array([1, 3, 2], dtype="Int64"), pd.array([1, 3, 2], dtype="Float32"), pd.array([1, 10, 2], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"), pd.to_timedelta(["1 Day", "3 Days", "2 Days"]), pd.IntervalIndex( [pd.Interval(0, 1), pd.Interval(2, 3), pd.Interval(1, 2)]), ], ids=lambda x: str(x.dtype), ) def values_for_np_reduce(request):