Example #1
 def test_extension_array(self):
     # a = array([1, 3, np.nan, 2], dtype='Int64')
     a = array([1, 3, 2], dtype='Int64')
     result = safe_sort(a)
     # expected = array([1, 2, 3, np.nan], dtype='Int64')
     expected = array([1, 2, 3], dtype='Int64')
     tm.assert_extension_array_equal(result, expected)
Example #2
 def test_extension_array_labels(self, verify, na_sentinel):
     a = array([1, 3, 2], dtype='Int64')
     result, labels = safe_sort(a, [0, 1, na_sentinel, 2],
                                na_sentinel=na_sentinel, verify=verify)
     expected_values = array([1, 2, 3], dtype='Int64')
     expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
     tm.assert_extension_array_equal(result, expected_values)
     tm.assert_numpy_array_equal(labels, expected_labels)
 def test_add_column_with_pandas_array(self):
     # GH 26390
     df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['a', 'b', 'c', 'd']})
     df['c'] = pd.array([1, 2, None, 3])
     df2 = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['a', 'b', 'c', 'd'],
                         'c': pd.array([1, 2, None, 3])})
     assert type(df['c']._data.blocks[0]) == ObjectBlock
     assert type(df2['c']._data.blocks[0]) == ObjectBlock
     assert_frame_equal(df, df2)
Example #4
def test_array_copy():
    a = np.array([1, 2])
    # default is to copy
    b = pd.array(a)
    assert np.shares_memory(a, b._ndarray) is False

    # copy=True
    b = pd.array(a, copy=True)
    assert np.shares_memory(a, b._ndarray) is False

    # copy=False
    b = pd.array(a, copy=False)
    assert np.shares_memory(a, b._ndarray) is True
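For context (a minimal sketch, not part of the test above, assuming pandas >= 1.0, where PandasArray may also appear under the newer name NumpyExtensionArray): passing an explicit dtype removes any ambiguity about which array type pd.array returns.

import numpy as np
import pandas as pd

a = np.array([1, 2])

# An explicit NumPy dtype keeps the data in a NumPy-backed PandasArray ...
plain = pd.array(a, dtype=a.dtype)
print(plain.dtype)      # int64

# ... while a nullable dtype produces a masked IntegerArray.
nullable = pd.array(a, dtype="Int64")
print(nullable.dtype)   # Int64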
def test_reductions_frame_dtypes():
    df = pd.DataFrame({'int': [1, 2, 3, 4, 5, 6, 7, 8],
                       'float': [1., 2., 3., 4., np.nan, 6., 7., 8.],
                       'dt': [pd.NaT] + [datetime(2011, i, 1) for i in range(1, 8)],
                       'str': list('abcdefgh')})

    if HAS_INT_NA:
        df['intna'] = pd.array([1, 2, 3, 4, None, 6, 7, 8], dtype=pd.Int64Dtype())

    ddf = dd.from_pandas(df, 3)
    assert_eq(df.sum(), ddf.sum())
    assert_eq(df.prod(), ddf.prod())
    assert_eq(df.min(), ddf.min())
    assert_eq(df.max(), ddf.max())
    assert_eq(df.count(), ddf.count())
    assert_eq(df.std(), ddf.std())
    assert_eq(df.var(), ddf.var())
    assert_eq(df.sem(), ddf.sem())
    assert_eq(df.std(ddof=0), ddf.std(ddof=0))
    assert_eq(df.var(ddof=0), ddf.var(ddof=0))
    assert_eq(df.sem(ddof=0), ddf.sem(ddof=0))

    result = ddf.mean()
    expected = df.mean()
    assert_eq(expected, result)

    assert_eq(df._get_numeric_data(), ddf._get_numeric_data())

    numerics = ddf[['int', 'float']]
    assert numerics._get_numeric_data().dask == numerics.dask
    def test_from_pandas_array(self):
        arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9

        result = DatetimeArray._from_sequence(arr, freq='infer')

        expected = pd.date_range('1970-01-01', periods=5, freq='H')._data
        tm.assert_datetime_array_equal(result, expected)
Example #7
def test_array_not_registered(registry_without_decimal):
    # check we aren't on it
    assert registry.find('decimal') is None
    data = [decimal.Decimal('1'), decimal.Decimal('2')]

    result = pd.array(data, dtype=DecimalDtype)
    expected = DecimalArray._from_sequence(data)
    tm.assert_equal(result, expected)
Example #8
def test_array_unboxes(box):
    data = box([decimal.Decimal('1'), decimal.Decimal('2')])
    # make sure it works
    with pytest.raises(TypeError):
        DecimalArray2._from_sequence(data)

    result = pd.array(data, dtype='decimal2')
    expected = DecimalArray2._from_sequence(data.values)
    tm.assert_equal(result, expected)
Example #9
    def test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype):
        arr = pd.array([1, 3, 90], dtype=any_real_dtype)
        result = arr.searchsorted(30)
        assert is_scalar(result)
        assert result == 2

        result = arr.searchsorted([30])
        expected = np.array([2], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
Example #10
def _nonempty_series(s, idx=None):
    # TODO: Use register dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = None
        data = pd.Categorical(data, categories=cats,
                              ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period('2000', freq), pd.Period('2001', freq)]
    elif is_sparse(dtype):
        # TODO: pandas <0.24
        # Pandas <= 0.23.4:
        if PANDAS_GT_0240:
            entry = _scalar_from_dtype(dtype.subtype)
        else:
            entry = _scalar_from_dtype(dtype.subtype)
        data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_0240:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = np.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    return pd.Series(data, name=s.name, index=idx)
Example #11
    def test_searchsorted(self, string_dtype):
        arr = pd.array(['a', 'b', 'c'], dtype=string_dtype)

        result = arr.searchsorted('a', side='left')
        assert is_scalar(result)
        assert result == 0

        result = arr.searchsorted('a', side='right')
        assert is_scalar(result)
        assert result == 1
Example #12
def test_make_block_no_pandas_array():
    # https://github.com/pandas-dev/pandas/pull/24866
    arr = pd.array([1, 2])

    # PandasArray, no dtype
    result = make_block(arr, slice(len(arr)))
    assert result.is_integer is True
    assert result.is_extension is False

    # PandasArray, PandasDtype
    result = make_block(arr, slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False

    # ndarray, PandasDtype
    result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False
Example #13
def test_dataframe_reductions(op):
    # https://github.com/pandas-dev/pandas/pull/32867
    # ensure the integers are not cast to float during reductions
    df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
    result = df.max()
    assert isinstance(result["a"], np.int64)
Example #14
 def test_pandas_array_dtype(self, data):
     # ... but specifying dtype will override idempotency
     result = pd.array(data, dtype=np.dtype(object))
     expected = pd.arrays.PandasArray(np.asarray(data, dtype=object))
     self.assert_equal(result, expected)
Example #15
class TestDataFrameCov:
    def test_cov(self, float_frame, float_string_frame):
        # min_periods no NAs (corner case)
        expected = float_frame.cov()
        result = float_frame.cov(min_periods=len(float_frame))

        tm.assert_frame_equal(expected, result)

        result = float_frame.cov(min_periods=len(float_frame) + 1)
        assert isna(result.values).all()

        # with NAs
        frame = float_frame.copy()
        frame["A"][:5] = np.nan
        frame["B"][5:10] = np.nan
        result = frame.cov(min_periods=len(frame) - 8)
        expected = frame.cov()
        expected.loc["A", "B"] = np.nan
        expected.loc["B", "A"] = np.nan
        tm.assert_frame_equal(result, expected)

        # regular
        result = frame.cov()
        expected = frame["A"].cov(frame["C"])
        tm.assert_almost_equal(result["A"]["C"], expected)

        # exclude non-numeric types
        result = float_string_frame.cov()
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
        tm.assert_frame_equal(result, expected)

        # Single column frame
        df = DataFrame(np.linspace(0.0, 1.0, 10))
        result = df.cov()
        expected = DataFrame(np.cov(df.values.T).reshape((1, 1)),
                             index=df.columns,
                             columns=df.columns)
        tm.assert_frame_equal(result, expected)
        df.loc[0] = np.nan
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values[1:].T).reshape((1, 1)),
            index=df.columns,
            columns=df.columns,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
    def test_cov_ddof(self, test_ddof):
        # GH#34611
        np_array1 = np.random.rand(10)
        np_array2 = np.random.rand(10)
        df = DataFrame({0: np_array1, 1: np_array2})
        result = df.cov(ddof=test_ddof)
        expected_np = np.cov(np_array1, np_array2, ddof=test_ddof)
        expected = DataFrame(expected_np)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "other_column",
        [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])])
    def test_cov_nullable_integer(self, other_column):
        # https://github.com/pandas-dev/pandas/issues/33803
        data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
        result = data.cov()
        arr = np.array([[0.5, 0.5], [0.5, 1.0]])
        expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(result, expected)
        if wins[i] == max_wins:
            leader.append(i)
    if sum(remain) == 0:
        for i in range(teams):
            eliminated.append(i)
        for i in range(len(leader)):
            eliminated.remove(leader[i])
        for i in eliminated:
            # print("Teams which got eliminated are -")
            print(i, end=" ")
    else:
        i = 0
        n = 5
        layer1_values = []
        for i in range(teams - 1):
            y = pd.array(data.iloc[i])
            layer1_values.append(y[n:])
            n = n + 1
        Layer1_capacities = list(chain.from_iterable(layer1_values))
        layer_id = 'L1'
        layer_1_ids = []
        for i in range(0, len(Layer1_capacities)):
            node_id = layer_id + str(i)
            layer_1_ids.append(node_id)
        layer_id = 'L2'
        layer_2_ids = []
        for i in range(0, teams):
            node_id = layer_id + str(i)
            layer_2_ids.append(node_id)

Example #17
 def test_searchsorted_sorter(self, any_real_dtype):
     arr = pd.array([3, 1, 2], dtype=any_real_dtype)
     result = arr.searchsorted([0, 3], sorter=np.argsort(arr))
     expected = np.array([0, 2], dtype=np.intp)
     tm.assert_numpy_array_equal(result, expected)
Example #18
 def time_from_float_array(self):
     pd.array(self.values_float, dtype="boolean")
Example #19
def test_min_max(method, skipna):
    arr = pd.Series(["a", "b", "c", None], dtype="string")
    result = getattr(arr, method)(skipna=skipna)
    if skipna:
        expected = "a" if method == "min" else "c"
        assert result == expected
    else:
        assert result is pd.NA


@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize(
    "arr",
    [
        pd.Series(["a", "b", "c", None], dtype="string"),
        pd.array(["a", "b", "c", None], dtype="string"),
    ],
)
def test_min_max_numpy(method, arr):
    result = getattr(np, method)(arr)
    expected = "a" if method == "min" else "c"
    assert result == expected


@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.xfail(reason="Not implemented StringArray.sum")
def test_reduce_missing(skipna):
    arr = pd.Series([None, "a", None, "b", "c", None], dtype="string")
    result = arr.sum(skipna=skipna)
    if skipna:
        assert result == "abc"
def data_for_grouping(dtype):
    b = 0.1
    a = 0.0
    c = 0.2
    na = pd.NA
    return pd.array([b, b, na, na, a, a, b, c], dtype=dtype)
Example #21
 def test_iloc_nullable_int64_size_1_nan(self):
     # GH 31861
     result = DataFrame({"a": ["test"], "b": [np.nan]})
     result.loc[:, "b"] = result.loc[:, "b"].astype("Int64")
     expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")})
     tm.assert_frame_equal(result, expected)
def data_for_sorting(dtype):
    return pd.array([0.1, 0.2, 0.0], dtype=dtype)
def data_missing_for_sorting(dtype):
    return pd.array([0.1, pd.NA, 0.0], dtype=dtype)
def data_missing(dtype):
    return pd.array([pd.NA, 0.1], dtype=dtype)
def data_for_twos(dtype):
    return pd.array(np.ones(100) * 2, dtype=dtype)
def data(dtype):
    return pd.array(make_data(), dtype=dtype)
Example #27
def test_array_inference_fails(data):
    result = pd.array(data)
    expected = PandasArray(np.array(data, dtype=object))
    tm.assert_extension_array_equal(result, expected)
Example #28
 def test_replace_extension_other(self):
     # https://github.com/pandas-dev/pandas/issues/34530
     ser = pd.Series(pd.array([1, 2, 3], dtype="Int64"))
     ser.replace("", "")  # no exception
Example #29
def test_scalar_raises():
    with pytest.raises(ValueError,
                       match="Cannot pass scalar '1'"):
        pd.array(1)
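Since pd.array rejects scalars, the minimal working variant (a sketch, not part of the test above) wraps the value in a one-element sequence:

import pandas as pd

# pd.array() only accepts list-like input, so wrap a scalar in a sequence.
arr = pd.array([1], dtype="Int64")
print(arr)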
Example #30
 def time_from_integer_array(self):
     pd.array(self.values_integer, dtype="Int64")
Example #31
def test_astype_int():
    arr = pd.array(["1", pd.NA, "3"], dtype="string")

    result = arr.astype("Int64")
    expected = pd.array([1, pd.NA, 3], dtype="Int64")
    tm.assert_extension_array_equal(result, expected)
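The reverse conversion works too; a small sketch, assuming a recent pandas where masked arrays can be cast to the nullable string dtype, with pd.NA preserved:

import pandas as pd

nums = pd.array([1, pd.NA, 3], dtype="Int64")
back = nums.astype("string")
print(back)   # ['1', <NA>, '3'] with dtype string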
Example #32
 def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype):
     arr = pd.array([1, 3, 90], dtype=any_real_dtype)
     result = arr.searchsorted([2, 30])
     expected = np.array([1, 2], dtype=np.intp)
     tm.assert_numpy_array_equal(result, expected)
Example #33
                 usecols=[0, 1, 6],
                 parse_dates=[1])

print(df.head())

# In[3]:

start = pd.to_datetime('2019-04-01', format='%Y-%m-%d')
end = pd.to_datetime('2019-07-01', format='%Y-%m-%d')

print(f'start: {start}')
print(f'end: {end}')

# In[4]:

s_full = pd.array(df.iloc[:, 0])
t_full = pd.array(pd.DatetimeIndex(df.iloc[:, 1]).astype(
    np.int64)) / 1000000000

t_full = np.extract([s_full == 2], t_full)

dt = t_full[1] - t_full[0]
print(f'data sampling is {dt:.2f} secs')

# In[5]:

t_start = pd.DatetimeIndex([start]).astype(np.int64) / 1000000000
t_end = pd.DatetimeIndex([end]).astype(np.int64) / 1000000000

t = np.extract([(t_full >= t_start[0]) & (t_full <= t_end[0])], t_full)
Example #34
 def test_take_all_empty(self):
     a = pd.array([0, 0], dtype=pd.SparseDtype("int64"))
     result = a.take([0, 1], allow_fill=True, fill_value=np.nan)
     tm.assert_sp_array_equal(a, result)
Example #35
    def test_from_array(self):
        result = pd.Series(pd.array(['1H', '2H'], dtype='timedelta64[ns]'))
        assert result._data.blocks[0].is_extension is False

        result = pd.Series(pd.array(['2015'], dtype='datetime64[ns]'))
        assert result._data.blocks[0].is_extension is False
Example #36
 def test_pandas_array(self, data):
     # pd.array(extension_array) should be idempotent...
     result = pd.array(data)
     self.assert_extension_array_equal(result, data)
Example #37
 def test_pandas_array_dtype(self, data):
     # ... but specifying dtype will override idempotency
     result = pd.array(data, dtype=np.dtype(object))
     expected = pd.arrays.PandasArray(np.asarray(data, dtype=object))
     self.assert_equal(result, expected)
Example #38
def test_is_string_dtype_nullable(nullable_string_dtype):
    assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype))
Example #39
 def test_search_sorted_datetime64_scalar(self, arr, val):
     arr = pd.array(arr)
     result = arr.searchsorted(val)
     assert is_scalar(result)
     assert result == 1
Example #40
class BaseGetitemTests(BaseExtensionTests):
    """Tests for ExtensionArray.__getitem__."""
    def test_iloc_series(self, data):
        ser = pd.Series(data)
        result = ser.iloc[:4]
        expected = pd.Series(data[:4])
        self.assert_series_equal(result, expected)

        result = ser.iloc[[0, 1, 2, 3]]
        self.assert_series_equal(result, expected)

    def test_iloc_frame(self, data):
        df = pd.DataFrame({
            "A": data,
            "B": np.arange(len(data), dtype="int64")
        })
        expected = pd.DataFrame({"A": data[:4]})

        # slice -> frame
        result = df.iloc[:4, [0]]
        self.assert_frame_equal(result, expected)

        # sequence -> frame
        result = df.iloc[[0, 1, 2, 3], [0]]
        self.assert_frame_equal(result, expected)

        expected = pd.Series(data[:4], name="A")

        # slice -> series
        result = df.iloc[:4, 0]
        self.assert_series_equal(result, expected)

        # sequence -> series
        result = df.iloc[:4, 0]
        self.assert_series_equal(result, expected)

        # GH#32959 slice columns with step
        result = df.iloc[:, ::2]
        self.assert_frame_equal(result, df[["A"]])
        result = df[["B", "A"]].iloc[:, ::2]
        self.assert_frame_equal(result, df[["B"]])

    def test_iloc_frame_single_block(self, data):
        # GH#32959 null slice along index, slice along columns with single-block
        df = pd.DataFrame({"A": data})

        result = df.iloc[:, :]
        self.assert_frame_equal(result, df)

        result = df.iloc[:, :1]
        self.assert_frame_equal(result, df)

        result = df.iloc[:, :2]
        self.assert_frame_equal(result, df)

        result = df.iloc[:, ::2]
        self.assert_frame_equal(result, df)

        result = df.iloc[:, 1:2]
        self.assert_frame_equal(result, df.iloc[:, :0])

        result = df.iloc[:, -1:]
        self.assert_frame_equal(result, df)

    def test_loc_series(self, data):
        ser = pd.Series(data)
        result = ser.loc[:3]
        expected = pd.Series(data[:4])
        self.assert_series_equal(result, expected)

        result = ser.loc[[0, 1, 2, 3]]
        self.assert_series_equal(result, expected)

    def test_loc_frame(self, data):
        df = pd.DataFrame({
            "A": data,
            "B": np.arange(len(data), dtype="int64")
        })
        expected = pd.DataFrame({"A": data[:4]})

        # slice -> frame
        result = df.loc[:3, ["A"]]
        self.assert_frame_equal(result, expected)

        # sequence -> frame
        result = df.loc[[0, 1, 2, 3], ["A"]]
        self.assert_frame_equal(result, expected)

        expected = pd.Series(data[:4], name="A")

        # slice -> series
        result = df.loc[:3, "A"]
        self.assert_series_equal(result, expected)

        # sequence -> series
        result = df.loc[:3, "A"]
        self.assert_series_equal(result, expected)

    def test_loc_iloc_frame_single_dtype(self, data):
        # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
        #  return a scalar
        df = pd.DataFrame({"A": data})
        expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)

        result = df.loc[2]
        self.assert_series_equal(result, expected)

        expected = pd.Series([data[-1]],
                             index=["A"],
                             name=len(data) - 1,
                             dtype=data.dtype)
        result = df.iloc[-1]
        self.assert_series_equal(result, expected)

    def test_getitem_scalar(self, data):
        result = data[0]
        assert isinstance(result, data.dtype.type)

        result = pd.Series(data)[0]
        assert isinstance(result, data.dtype.type)

    def test_getitem_invalid(self, data):
        # TODO: box over scalar, [scalar], (scalar,)?

        msg = (
            r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
            r"\(`None`\) and integer or boolean arrays are valid indices")
        with pytest.raises(IndexError, match=msg):
            data["foo"]
        with pytest.raises(IndexError, match=msg):
            data[2.5]

        ub = len(data)
        msg = "|".join([
            "list index out of range",  # json
            "index out of bounds",  # pyarrow
            "Out of bounds access",  # Sparse
            f"loc must be an integer between -{ub} and {ub}",  # Sparse
            f"index {ub+1} is out of bounds for axis 0 with size {ub}",
            f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
        ])
        with pytest.raises(IndexError, match=msg):
            data[ub + 1]
        with pytest.raises(IndexError, match=msg):
            data[-ub - 1]

    def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
        result = data_missing[0]
        assert na_cmp(result, na_value)

    def test_getitem_empty(self, data):
        # Indexing with empty list
        result = data[[]]
        assert len(result) == 0
        assert isinstance(result, type(data))

        expected = data[np.array([], dtype="int64")]
        self.assert_extension_array_equal(result, expected)

    def test_getitem_mask(self, data):
        # Empty mask, raw array
        mask = np.zeros(len(data), dtype=bool)
        result = data[mask]
        assert len(result) == 0
        assert isinstance(result, type(data))

        # Empty mask, in series
        mask = np.zeros(len(data), dtype=bool)
        result = pd.Series(data)[mask]
        assert len(result) == 0
        assert result.dtype == data.dtype

        # non-empty mask, raw array
        mask[0] = True
        result = data[mask]
        assert len(result) == 1
        assert isinstance(result, type(data))

        # non-empty mask, in series
        result = pd.Series(data)[mask]
        assert len(result) == 1
        assert result.dtype == data.dtype

    def test_getitem_mask_raises(self, data):
        mask = np.array([True, False])
        msg = f"Boolean index has wrong length: 2 instead of {len(data)}"
        with pytest.raises(IndexError, match=msg):
            data[mask]

        mask = pd.array(mask, dtype="boolean")
        with pytest.raises(IndexError, match=msg):
            data[mask]

    def test_getitem_boolean_array_mask(self, data):
        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
        result = data[mask]
        assert len(result) == 0
        assert isinstance(result, type(data))

        result = pd.Series(data)[mask]
        assert len(result) == 0
        assert result.dtype == data.dtype

        mask[:5] = True
        expected = data.take([0, 1, 2, 3, 4])
        result = data[mask]
        self.assert_extension_array_equal(result, expected)

        expected = pd.Series(expected)
        result = pd.Series(data)[mask]
        self.assert_series_equal(result, expected)

    def test_getitem_boolean_na_treated_as_false(self, data):
        # https://github.com/pandas-dev/pandas/issues/31503
        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
        mask[:2] = pd.NA
        mask[2:4] = True

        result = data[mask]
        expected = data[mask.fillna(False)]

        self.assert_extension_array_equal(result, expected)

        s = pd.Series(data)

        result = s[mask]
        expected = s[mask.fillna(False)]

        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2],
         pd.array([0, 1, 2], dtype="Int64"),
         np.array([0, 1, 2])],
        ids=["list", "integer-array", "numpy-array"],
    )
    def test_getitem_integer_array(self, data, idx):
        result = data[idx]
        assert len(result) == 3
        assert isinstance(result, type(data))
        expected = data.take([0, 1, 2])
        self.assert_extension_array_equal(result, expected)

        expected = pd.Series(expected)
        result = pd.Series(data)[idx]
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2, pd.NA],
         pd.array([0, 1, 2, pd.NA], dtype="Int64")],
        ids=["list", "integer-array"],
    )
    def test_getitem_integer_with_missing_raises(self, data, idx):
        msg = "Cannot index with an integer indexer containing NA values"
        with pytest.raises(ValueError, match=msg):
            data[idx]

    @pytest.mark.xfail(reason="Tries label-based and raises KeyError; "
                       "in some cases raises when calling np.asarray")
    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2, pd.NA],
         pd.array([0, 1, 2, pd.NA], dtype="Int64")],
        ids=["list", "integer-array"],
    )
    def test_getitem_series_integer_with_missing_raises(self, data, idx):
        msg = "Cannot index with an integer indexer containing NA values"
        # TODO: this raises KeyError about labels not found (it tries label-based)

        ser = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])
        with pytest.raises(ValueError, match=msg):
            ser[idx]

    def test_getitem_slice(self, data):
        # getitem[slice] should return an array
        result = data[slice(0)]  # empty
        assert isinstance(result, type(data))

        result = data[slice(1)]  # scalar
        assert isinstance(result, type(data))

    def test_getitem_ellipsis_and_slice(self, data):
        # GH#40353 this is called from getitem_block_index
        result = data[..., :]
        self.assert_extension_array_equal(result, data)

        result = data[:, ...]
        self.assert_extension_array_equal(result, data)

        result = data[..., :3]
        self.assert_extension_array_equal(result, data[:3])

        result = data[:3, ...]
        self.assert_extension_array_equal(result, data[:3])

        result = data[..., ::2]
        self.assert_extension_array_equal(result, data[::2])

        result = data[::2, ...]
        self.assert_extension_array_equal(result, data[::2])

    def test_get(self, data):
        # GH 20882
        s = pd.Series(data, index=[2 * i for i in range(len(data))])
        assert s.get(4) == s.iloc[2]

        result = s.get([4, 6])
        expected = s.iloc[[2, 3]]
        self.assert_series_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning, match="label-based"):
            result = s.get(slice(2))
        expected = s.iloc[[0, 1]]
        self.assert_series_equal(result, expected)

        assert s.get(-1) is None
        assert s.get(s.index.max() + 1) is None

        s = pd.Series(data[:6], index=list("abcdef"))
        assert s.get("c") == s.iloc[2]

        result = s.get(slice("b", "d"))
        expected = s.iloc[[1, 2, 3]]
        self.assert_series_equal(result, expected)

        result = s.get("Z")
        assert result is None

        assert s.get(4) == s.iloc[4]
        assert s.get(-1) == s.iloc[-1]
        assert s.get(len(s)) is None

        # GH 21257
        s = pd.Series(data)
        with tm.assert_produces_warning(None):
            # GH#45324 make sure we aren't giving a spurious FutureWarning
            s2 = s[::2]
        assert s2.get(1) is None

    def test_take_sequence(self, data):
        result = pd.Series(data)[[0, 1, 3]]
        assert result.iloc[0] == data[0]
        assert result.iloc[1] == data[1]
        assert result.iloc[2] == data[3]

    def test_take(self, data, na_value, na_cmp):
        result = data.take([0, -1])
        assert result.dtype == data.dtype
        assert result[0] == data[0]
        assert result[1] == data[-1]

        result = data.take([0, -1], allow_fill=True, fill_value=na_value)
        assert result[0] == data[0]
        assert na_cmp(result[1], na_value)

        with pytest.raises(IndexError, match="out of bounds"):
            data.take([len(data) + 1])

    def test_take_empty(self, data, na_value, na_cmp):
        empty = data[:0]

        result = empty.take([-1], allow_fill=True)
        assert na_cmp(result[0], na_value)

        msg = "cannot do a non-empty take from an empty axes|out of bounds"

        with pytest.raises(IndexError, match=msg):
            empty.take([-1])

        with pytest.raises(IndexError, match="cannot do a non-empty take"):
            empty.take([0, 1])

    def test_take_negative(self, data):
        # https://github.com/pandas-dev/pandas/issues/20640
        n = len(data)
        result = data.take([0, -n, n - 1, -1])
        expected = data.take([0, 0, n - 1, n - 1])
        self.assert_extension_array_equal(result, expected)

    def test_take_non_na_fill_value(self, data_missing):
        fill_value = data_missing[1]  # valid
        na = data_missing[0]

        arr = data_missing._from_sequence([na, fill_value, na],
                                          dtype=data_missing.dtype)
        result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True)
        expected = arr.take([1, 1])
        self.assert_extension_array_equal(result, expected)

    def test_take_pandas_style_negative_raises(self, data, na_value):
        with pytest.raises(ValueError, match=""):
            data.take([0, -2], fill_value=na_value, allow_fill=True)

    @pytest.mark.parametrize("allow_fill", [True, False])
    def test_take_out_of_bounds_raises(self, data, allow_fill):
        arr = data[:3]

        with pytest.raises(IndexError, match="out of bounds|out-of-bounds"):
            arr.take(np.asarray([0, 3]), allow_fill=allow_fill)

    def test_take_series(self, data):
        s = pd.Series(data)
        result = s.take([0, -1])
        expected = pd.Series(
            data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
            index=[0, len(data) - 1],
        )
        self.assert_series_equal(result, expected)

    def test_reindex(self, data, na_value):
        s = pd.Series(data)
        result = s.reindex([0, 1, 3])
        expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
        self.assert_series_equal(result, expected)

        n = len(data)
        result = s.reindex([-1, 0, n])
        expected = pd.Series(
            data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
            index=[-1, 0, n],
        )
        self.assert_series_equal(result, expected)

        result = s.reindex([n, n + 1])
        expected = pd.Series(data._from_sequence([na_value, na_value],
                                                 dtype=s.dtype),
                             index=[n, n + 1])
        self.assert_series_equal(result, expected)

    def test_reindex_non_na_fill_value(self, data_missing):
        valid = data_missing[1]
        na = data_missing[0]

        arr = data_missing._from_sequence([na, valid],
                                          dtype=data_missing.dtype)
        ser = pd.Series(arr)
        result = ser.reindex([0, 1, 2], fill_value=valid)
        expected = pd.Series(
            data_missing._from_sequence([na, valid, valid],
                                        dtype=data_missing.dtype))

        self.assert_series_equal(result, expected)

    def test_loc_len1(self, data):
        # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim
        df = pd.DataFrame({"A": data})
        res = df.loc[[0], "A"]
        assert res.ndim == 1
        assert res._mgr.arrays[0].ndim == 1
        if hasattr(res._mgr, "blocks"):
            assert res._mgr._block.ndim == 1

    def test_item(self, data):
        # https://github.com/pandas-dev/pandas/pull/30175
        s = pd.Series(data)
        result = s[:1].item()
        assert result == data[0]

        msg = "can only convert an array of size 1 to a Python scalar"
        with pytest.raises(ValueError, match=msg):
            s[:0].item()

        with pytest.raises(ValueError, match=msg):
            s.item()

    def test_ellipsis_index(self):
        # GH42430 1D slices over extension types turn into N-dimensional slices over
        #  ExtensionArrays
        class CapturingStringArray(pd.arrays.StringArray):
            """Extend StringArray to capture arguments to __getitem__"""
            def __getitem__(self, item):
                self.last_item_arg = item
                return super().__getitem__(item)

        df = pd.DataFrame({
            "col1":
            CapturingStringArray(np.array(["hello", "world"], dtype=object))
        })
        _ = df.iloc[:1]

        # String comparison because there's no native way to compare slices.
        # Before the fix for GH42430, last_item_arg would get set to the 2D slice
        # (Ellipsis, slice(None, 1, None))
        self.assert_equal(str(df["col1"].array.last_item_arg),
                          "slice(None, 1, None)")
Example #41
 def test_pandas_array(self, data):
     # pd.array(extension_array) should be idempotent...
     result = pd.array(data)
     self.assert_extension_array_equal(result, data)
Example #42
 def time_from_bool_array(self):
     pd.array(self.values_bool, dtype="boolean")
Example #43
class TestDataFrameCorr:
    # DataFrame.corr(), as opposed to DataFrame.corrwith

    @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
    @td.skip_if_no_scipy
    def test_corr_scipy_method(self, float_frame, method):
        float_frame["A"][:5] = np.nan
        float_frame["B"][5:10] = np.nan
        float_frame["A"][:10] = float_frame["A"][10:20]

        correls = float_frame.corr(method=method)
        expected = float_frame["A"].corr(float_frame["C"], method=method)
        tm.assert_almost_equal(correls["A"]["C"], expected)

    # ---------------------------------------------------------------------

    def test_corr_non_numeric(self, float_string_frame):
        # exclude non-numeric types
        result = float_string_frame.corr()
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr()
        tm.assert_frame_equal(result, expected)

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_nooverlap(self, meth):
        # nothing in common
        df = DataFrame({
            "A": [1, 1.5, 1, np.nan, np.nan, np.nan],
            "B": [np.nan, np.nan, np.nan, 1, 1.5, 1],
            "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        })
        rs = df.corr(meth)
        assert isna(rs.loc["A", "B"])
        assert isna(rs.loc["B", "A"])
        assert rs.loc["A", "A"] == 1
        assert rs.loc["B", "B"] == 1
        assert isna(rs.loc["C", "C"])

    @pytest.mark.parametrize("meth", ["pearson", "spearman"])
    def test_corr_constant(self, meth):
        # constant --> all NA
        df = DataFrame({
            "A": [1, 1, 1, np.nan, np.nan, np.nan],
            "B": [np.nan, np.nan, np.nan, 1, 1, 1],
        })
        rs = df.corr(meth)
        assert isna(rs.values).all()

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_int_and_boolean(self, meth):
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        df = DataFrame({"a": [True, False], "b": [1, 0]})

        expected = DataFrame(np.ones((2, 2)),
                             index=["a", "b"],
                             columns=["a", "b"])

        with warnings.catch_warnings(record=True):
            warnings.simplefilter("ignore", RuntimeWarning)
            result = df.corr(meth)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("method", ["cov", "corr"])
    def test_corr_cov_independent_index_column(self, method):
        # GH#14617
        df = DataFrame(np.random.randn(4 * 10).reshape(10, 4),
                       columns=list("abcd"))
        result = getattr(df, method)()
        assert result.index is not result.columns
        assert result.index.equals(result.columns)

    def test_corr_invalid_method(self):
        # GH#22298
        df = DataFrame(np.random.normal(size=(10, 2)))
        msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
        with pytest.raises(ValueError, match=msg):
            df.corr(method="____")

    def test_corr_int(self):
        # dtypes other than float64 GH#1761
        df = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})

        df.cov()
        df.corr()

    @td.skip_if_no_scipy
    @pytest.mark.parametrize(
        "nullable_column",
        [pd.array([1, 2, 3]), pd.array([1, 2, None])])
    @pytest.mark.parametrize(
        "other_column",
        [
            pd.array([1, 2, 3]),
            np.array([1.0, 2.0, 3.0]),
            np.array([1.0, 2.0, np.nan])
        ],
    )
    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_nullable_integer(self, nullable_column, other_column,
                                   method):
        # https://github.com/pandas-dev/pandas/issues/33803
        data = DataFrame({"a": nullable_column, "b": other_column})
        result = data.corr(method=method)
        expected = DataFrame(np.ones((2, 2)),
                             columns=["a", "b"],
                             index=["a", "b"])
        tm.assert_frame_equal(result, expected)

    def test_corr_item_cache(self):
        # Check that corr does not lead to incorrect entries in item_cache

        df = DataFrame({"A": range(10)})
        df["B"] = range(10)[::-1]

        ser = df["A"]  # populate item_cache
        assert len(df._mgr.arrays) == 2  # i.e. 2 blocks

        _ = df.corr()

        # Check that the corr didn't break link between ser and df
        ser.values[0] = 99
        assert df.loc[0, "A"] == 99
        assert df["A"] is ser
        assert df.values[0, 0] == 99

    @pytest.mark.parametrize("length", [2, 20, 200, 2000])
    def test_corr_for_constant_columns(self, length):
        # GH: 37448
        df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
        result = df.corr()
        expected = DataFrame({
            "A": [np.nan, np.nan],
            "B": [np.nan, np.nan]
        },
                             index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    def test_calc_corr_small_numbers(self):
        # GH: 37452
        df = DataFrame({
            "A": [1.0e-20, 2.0e-20, 3.0e-20],
            "B": [1.0e-20, 2.0e-20, 3.0e-20]
        })
        result = df.corr()
        expected = DataFrame({
            "A": [1.0, 1.0],
            "B": [1.0, 1.0]
        },
                             index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_min_periods_greater_than_length(self, method):
        df = DataFrame({"A": [1, 2], "B": [1, 2]})
        result = df.corr(method=method, min_periods=3)
        expected = DataFrame({
            "A": [np.nan, np.nan],
            "B": [np.nan, np.nan]
        },
                             index=["A", "B"])
        tm.assert_frame_equal(result, expected)
Example #44
def test_integer_array_numpy_sum(values, expected):
    arr = pd.array(values, dtype="Int64")
    result = np.sum(arr)
    assert result == expected
Example #45
def test_array(data, dtype, expected):
    result = pd.array(data, dtype=dtype)
    tm.assert_equal(result, expected)
Example #46
 def test_to_numpy_na_raises(self, dtype):
     a = pd.array([0, 1, None], dtype="Int64")
     with pytest.raises(ValueError, match=dtype):
         a.to_numpy(dtype=dtype)
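A sketch of the workarounds (assuming the masked-array to_numpy signature with dtype and na_value arguments): request an object result, or supply an explicit na_value for float output.

import numpy as np
import pandas as pd

a = pd.array([0, 1, None], dtype="Int64")

# object output can hold pd.NA directly ...
print(a.to_numpy(dtype=object))

# ... and a float result works once an explicit na_value is supplied.
print(a.to_numpy(dtype="float64", na_value=np.nan))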
Example #47
def test_array_inference(data, expected):
    result = pd.array(data)
    tm.assert_equal(result, expected)
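For context, a short sketch of the inference these parametrized cases exercise, assuming pandas >= 1.0 where pd.array infers the nullable integer, boolean and string dtypes:

import pandas as pd

print(pd.array([1, 2, None]).dtype)         # Int64
print(pd.array([True, False, None]).dtype)  # boolean
print(pd.array(["a", "b", None]).dtype)     # string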
Example #48
    def test_astype_str(self):
        a = pd.array([1, 2, None], dtype="Int64")
        expected = np.array(["1", "2", "<NA>"], dtype=object)

        tm.assert_numpy_array_equal(a.astype(str), expected)
        tm.assert_numpy_array_equal(a.astype("str"), expected)
Example #49
def test_nd_raises(data):
    with pytest.raises(ValueError, match='PandasArray must be 1-dimensional'):
        pd.array(data)
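pd.array only accepts one-dimensional input; a minimal sketch of two workarounds (the 2-D data here is just an illustration): flatten first, or build a DataFrame column by column.

import numpy as np
import pandas as pd

data_2d = np.arange(6).reshape(2, 3)

# Flatten before constructing the extension array ...
flat = pd.array(data_2d.ravel(), dtype="Int64")

# ... or keep the 2-D shape by building a DataFrame column by column.
df = pd.DataFrame({i: pd.array(col, dtype="Int64") for i, col in enumerate(data_2d.T)})
print(flat)
print(df.dtypes)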
Example #50
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

features = df[features_list2[1:]]
features = features.fillna(features.mean())

for name in df.name:
    if name in my_dataset.keys():
        for columns in features.columns:
            my_dataset[name][columns] = float(df[columns][df.name == name])

labels = df[features_list2[0]]
labels = np.array(labels)
features = np.array(features)


### RFECV method

### RFECV method: try 4 different classifiers - Logistic Regression, Decision Tree, Random Forest and Adaptive Boosting

### logistic regression

clf_Log = LogisticRegression(random_state=14, C=5, class_weight='balanced')
selectorCV_Log = RFECV(clf_Log, step=1, cv=5, scoring='f1')
selectorCV_Log.fit(features, labels)
refcv_figure(selectorCV_Log)
clf = selectorCV_Log.estimator_
features_new = selectorCV_Log.transform(features)
Example #51
         "a": 1
     }, "dict"),
     (gen_adata((3, 2)), "anndata"),
     (sparse.random(5, 3, format="csr", density=0.5), "csr_matrix"),
     (sparse.random(5, 3, format="csc", density=0.5), "csc_matrix"),
     (pd.DataFrame({"a": [1, 2, 3]}), "dataframe"),
     (pd.Categorical(list("aabccedd")), "categorical"),
     (pd.Categorical(list("aabccedd"), ordered=True), "categorical"),
     (pd.Categorical([1, 2, 1, 3], ordered=True), "categorical"),
     (
         pd.arrays.IntegerArray(np.ones(5, dtype=int),
                                mask=np.array(
                                    [True, False, True, False, True])),
         "nullable-integer",
     ),
     (pd.array([1, 2, 3]), "nullable-integer"),
     (
         pd.arrays.BooleanArray(
             np.random.randint(0, 2, size=5, dtype=bool),
             mask=np.random.randint(0, 2, size=5, dtype=bool),
         ),
         "nullable-boolean",
     ),
     (pd.array([True, False, True, True]), "nullable-boolean"),
     # (bytes, b"some bytes", "bytes"), # Does not work for zarr
     # TODO consider how specific encodings should be. Should we be fully describing the written type?
     # Currently the info we add is: "what you wouldn't be able to figure out yourself"
     # but that's not really a solid rule.
     # (bool, True, "bool"),
     # (bool, np.bool_(False), "bool"),
 ],
Example #52
 def time_from_integer_like(self):
     pd.array(self.values_integer_like, dtype="boolean")
Example #53
 def test_uses_pandas_na(self):
     a = pd.array([1, None], dtype=pd.Int64Dtype())
     assert a[1] is pd.NA
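A brief note on the pd.NA semantics this test touches (sketch, pandas >= 1.0): comparisons against pd.NA propagate missingness rather than returning False, so checks should use `is pd.NA` or pd.isna.

import pandas as pd

a = pd.array([1, None], dtype="Int64")

print(a[1] is pd.NA)   # True
print(pd.isna(a[1]))   # True
print(a[1] == 1)       # <NA> -- comparisons with pd.NA propagate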
Example #54
        def __init__(self, value):
            self.value = value

        def __add__(self, other):
            return self.value + other.value

    arr = np.array([Dummy(0), Dummy(1)])
    ser = pd.Series(arr)
    tm.assert_series_equal(np.add(ser, ser), pd.Series(np.add(ser, arr)))
    tm.assert_series_equal(np.add(ser, Dummy(1)),
                           pd.Series(np.add(ser, Dummy(1))))


@pytest.fixture(
    params=[
        pd.array([1, 3, 2], dtype=np.int64),
        pd.array([1, 3, 2], dtype="Int64"),
        pd.array([1, 3, 2], dtype="Float32"),
        pd.array([1, 10, 2], dtype="Sparse[int]"),
        pd.to_datetime(["2000", "2010", "2001"]),
        pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"),
        pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"),
        pd.to_timedelta(["1 Day", "3 Days", "2 Days"]),
        pd.IntervalIndex(
            [pd.Interval(0, 1),
             pd.Interval(2, 3),
             pd.Interval(1, 2)]),
    ],
    ids=lambda x: str(x.dtype),
)
def values_for_np_reduce(request):