def test_intersect(self):
    """BlockIndex/IntIndex.intersect: correct results, and errors on length mismatch."""

    def _check_correct(a, b, expected):
        # The intersection must compare equal to the expected index.
        result = a.intersect(b)
        assert (result.equals(expected))

    def _check_length_exc(a, longer):
        # Intersecting indexes with different total lengths must raise.
        self.assertRaises(Exception, a.intersect, longer)

    def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
        xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
        yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
        expected = BlockIndex(TEST_LENGTH, eloc, elen)
        # Same block layout but a longer total length -> must not intersect.
        longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen)

        _check_correct(xindex, yindex, expected)
        # Repeat the checks after converting block indexes to int indexes.
        _check_correct(xindex.to_int_index(), yindex.to_int_index(),
                       expected.to_int_index())

        _check_length_exc(xindex, longer_index)
        _check_length_exc(xindex.to_int_index(),
                          longer_index.to_int_index())

    if compat.is_platform_windows():
        pytest.skip("segfaults on win-64 when all tests are run")
    check_cases(_check_case)
def test_encode(self, html_encoding_file):
    """read_html honors the encoding for string, file-like, and filename inputs."""
    # Fixture filenames look like <lang>_<encoding>.html; recover the encoding.
    _, encoding = os.path.splitext(
        os.path.basename(html_encoding_file)
    )[0].split('_')

    try:
        # 1) raw bytes
        with open(html_encoding_file, 'rb') as fobj:
            from_string = self.read_html(fobj.read(), encoding=encoding,
                                         index_col=0).pop()

        # 2) file-like object wrapping the same bytes
        with open(html_encoding_file, 'rb') as fobj:
            from_file_like = self.read_html(BytesIO(fobj.read()),
                                            encoding=encoding,
                                            index_col=0).pop()

        # 3) plain filename
        from_filename = self.read_html(html_encoding_file,
                                       encoding=encoding,
                                       index_col=0).pop()
        # All three input styles must decode to the same frame.
        tm.assert_frame_equal(from_string, from_file_like)
        tm.assert_frame_equal(from_string, from_filename)
    except Exception:
        # seems utf-16/32 fail on windows
        if is_platform_windows():
            if '16' in encoding or '32' in encoding:
                pytest.skip()
        raise
def test_constructor_compound_dtypes(self):
    # GH 5191
    # compound dtypes should raise not-implementederror

    def f(dtype):
        # Small frame of (datetime, str, int) rows built with the given dtype.
        data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9))
        return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype)

    msg = ("compound dtypes are not implemented in the DataFrame"
           " constructor")
    with pytest.raises(NotImplementedError, match=msg):
        f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])

    # these work (though results may be unexpected)
    f('int64')
    f('float64')

    # 10822
    # invalid error message on dt inference
    if not compat.is_platform_windows():
        f('M8[ns]')
def test_header_not_enough_lines_as_recarray(self):
    """header=N skips N leading lines; a header past the data must raise."""
    if compat.is_platform_windows():
        raise nose.SkipTest(
            "segfaults on win-64, only when all tests are run")

    data = ('skip this\n'
            'skip this\n'
            'a,b,c\n'
            '1,2,3\n'
            '4,5,6')
    # header=2 -> the first two lines are skipped, row 2 becomes the header.
    reader = TextReader(StringIO(data), delimiter=',', header=2,
                        as_recarray=True)
    header = reader.header
    expected = [['a', 'b', 'c']]
    self.assertEqual(header, expected)

    recs = reader.read()
    expected = {'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}
    assert_array_dicts_equal(expected, recs)

    # not enough rows
    self.assertRaises(parser.CParserError, TextReader, StringIO(data),
                      delimiter=',', header=5, as_recarray=True)
def test_read_csv(self):
    """Smoke-test reading a CSV through a file:// URL (Python 2 only)."""
    if not compat.PY3:
        # Windows needs a third slash after the scheme (file:///C:/...).
        prefix = u("file:///") if compat.is_platform_windows() else u("file://")
        fname = prefix + compat.text_type(self.csv1)
        self.read_csv(fname, index_col=0, parse_dates=True)
def test_convert_rows_list_to_csv_str(self): rows_list = ["aaa", "bbb", "ccc"] ret = tm.convert_rows_list_to_csv_str(rows_list) if compat.is_platform_windows(): expected = "aaa\r\nbbb\r\nccc\r\n" else: expected = "aaa\nbbb\nccc\n" assert ret == expected
def test_invalid_index_types(self):
    """infer_freq rejects index types that cannot be converted to datetimes."""
    # test all index types
    for i in [tm.makeIntIndex(10), tm.makeFloatIndex(10),
              tm.makePeriodIndex(10)]:
        # The lambda is invoked immediately by assertRaises, so the loop
        # variable capture is safe here.
        self.assertRaises(TypeError, lambda: frequencies.infer_freq(i))

    # GH 10822
    # odd error message on conversions to datetime for unicode
    if not is_platform_windows():
        for i in [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]:
            self.assertRaises(ValueError, lambda: frequencies.infer_freq(i))
def test_pass_dtype_as_recarray(self):
    """dtype given by column name and by position is honored with as_recarray."""
    if compat.is_platform_windows() and self.low_memory:
        raise nose.SkipTest("segfaults on win-64, only when all tests are run")

    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    # as_recarray is deprecated, so a FutureWarning is expected.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = self.read_csv(StringIO(data), dtype={"one": "u1", 1: "S1"},
                               as_recarray=True)
        self.assertEqual(result["one"].dtype, "u1")
        self.assertEqual(result["two"].dtype, "S1")
def test_pass_dtype_as_recarray(self):
    """dtype given by column name and by position is honored with as_recarray."""
    if compat.is_platform_windows() and self.low_memory:
        raise nose.SkipTest(
            "segfaults on win-64, only when all tests are run")

    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'},
                           as_recarray=True)
    self.assertEqual(result['one'].dtype, 'u1')
    self.assertEqual(result['two'].dtype, 'S1')
def test_invalid_index_types(self):
    """infer_freq rejects index types that cannot be converted to datetimes."""
    # test all index types
    for i in [
            tm.makeIntIndex(10), tm.makeFloatIndex(10),
            tm.makePeriodIndex(10)
    ]:
        # The lambda is invoked immediately by assertRaises, so the loop
        # variable capture is safe here.
        self.assertRaises(TypeError, lambda: frequencies.infer_freq(i))

    # GH 10822
    # odd error message on conversions to datetime for unicode
    if not is_platform_windows():
        for i in [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]:
            self.assertRaises(ValueError, lambda: frequencies.infer_freq(i))
def test_constructor_bad_file(self):
    """MMapWrapper rejects non-mmapable objects and closed files."""
    if is_platform_windows():
        raise nose.SkipTest("skipping construction error messages "
                            "tests on windows")

    # A StringIO with a fake fileno cannot be memory-mapped.
    non_file = StringIO('I am not a file')
    non_file.fileno = lambda: -1

    msg = "Invalid argument"
    tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file)

    # A real but already-closed file is also rejected.
    target = open(self.mmap_file, 'r')
    target.close()

    msg = "I/O operation on closed file"
    tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)
def test_encode(self):
    """read_html honors encodings for string, file-like, and filename inputs."""
    assert self.files, 'no files read from the data folder'
    for f in self.files:
        # _lang_enc splits the fixture filename into (language, encoding).
        _, encoding = _lang_enc(f)
        try:
            from_string = self.read_string(f, encoding).pop()
            from_file_like = self.read_file_like(f, encoding).pop()
            from_filename = self.read_filename(f, encoding).pop()
            # All three input styles must decode to the same frame.
            tm.assert_frame_equal(from_string, from_file_like)
            tm.assert_frame_equal(from_string, from_filename)
        except Exception:
            # seems utf-16/32 fail on windows
            if is_platform_windows():
                if '16' in encoding or '32' in encoding:
                    continue
            raise
def test_itertuples(self):
    """DataFrame.itertuples: values, field names, and namedtuple fallback."""
    # Each tuple is (index, *row values) matching the positional row.
    for i, tup in enumerate(self.frame.itertuples()):
        s = self.klass._constructor_sliced(tup[1:])
        s.name = tup[0]
        expected = self.frame.iloc[i, :].reset_index(drop=True)
        self._assert_series_equal(s, expected)

    df = self.klass({
        'floats': np.random.randn(5),
        'ints': lrange(5)
    }, columns=['floats', 'ints'])

    for tup in df.itertuples(index=False):
        assert isinstance(tup[1], (int, long))

    df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
    # Duplicate column labels must both be emitted.
    dfaa = df[['a', 'a']]

    assert (list(dfaa.itertuples()) ==
            [(0, 1, 1), (1, 2, 2), (2, 3, 3)])

    # repr with be int/long on 32-bit/windows
    if not (compat.is_platform_windows() or compat.is_platform_32bit()):
        assert (repr(list(df.itertuples(
            name=None))) == '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

    tup = next(df.itertuples(name='TestName'))

    # namedtuple rename support requires Python >= 2.7
    if LooseVersion(sys.version) >= LooseVersion('2.7'):
        assert tup._fields == ('Index', 'a', 'b')
        assert (tup.Index, tup.a, tup.b) == tup
        assert type(tup).__name__ == 'TestName'

    # Invalid identifiers are replaced by positional field names (_1, _2).
    df.columns = ['def', 'return']
    tup2 = next(df.itertuples(name='TestName'))
    assert tup2 == (0, 1, 4)

    if LooseVersion(sys.version) >= LooseVersion('2.7'):
        assert tup2._fields == ('Index', '_1', '_2')

    df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})

    # will raise SyntaxError if trying to create namedtuple
    tup3 = next(df3.itertuples())
    assert not hasattr(tup3, '_fields')
    assert isinstance(tup3, tuple)
def test_tar_gz_to_different_filename():
    """to_csv with tar+gzip compression writes a valid archive despite an odd extension."""
    with tm.ensure_clean(filename=".foo") as file:
        pd.DataFrame(
            [["1", "2"]],
            columns=["foo", "bar"],
        ).to_csv(file, compression={"method": "tar", "mode": "w:gz"},
                 index=False)
        # Unpack by hand to verify both the archive structure and contents.
        with gzip.open(file) as uncompressed:
            with tarfile.TarFile(fileobj=uncompressed) as archive:
                members = archive.getmembers()
                assert len(members) == 1
                content = archive.extractfile(members[0]).read().decode("utf8")

                # CSV line terminator is platform dependent.
                if is_platform_windows():
                    expected = "foo,bar\r\n1,2\r\n"
                else:
                    expected = "foo,bar\n1,2\n"

                assert content == expected
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators,
                                 request):
    """Sparse frame vs scalar arithmetic; xfail known dtype/platform mismatches."""
    if data.dtype.fill_value != 0:
        pass
    elif all_arithmetic_operators.strip("_") not in [
            "mul",
            "rmul",
            "floordiv",
            "rfloordiv",
            "pow",
            "mod",
            "rmod",
    ]:
        # Other ops change the fill_value of the result.
        mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch")
        request.node.add_marker(mark)
    elif is_platform_windows() or not IS64:
        # Default integer dtype is int32 on Windows/32-bit platforms.
        mark = pytest.mark.xfail(reason="results are int32, expected int64")
        request.node.add_marker(mark)
    super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
def test_pass_dtype_as_recarray(self):
    """dtype given by column name and by position is honored with as_recarray."""
    if compat.is_platform_windows() and self.low_memory:
        pytest.skip(
            "segfaults on win-64, only when all tests are run")

    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    # as_recarray is deprecated, so a FutureWarning is expected.
    with tm.assert_produces_warning(
            FutureWarning, check_stacklevel=False):
        result = self.read_csv(StringIO(data), dtype={
            'one': 'u1', 1: 'S1'}, as_recarray=True)
        self.assertEqual(result['one'].dtype, 'u1')
        self.assertEqual(result['two'].dtype, 'S1')
def test_pass_dtype_as_recarray(self):
    """dtype given by column name and by position is honored with as_recarray."""
    if compat.is_platform_windows() and self.low_memory:
        raise nose.SkipTest(
            "segfaults on win-64, only when all tests are run")

    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    # as_recarray is deprecated, so a FutureWarning is expected.
    with tm.assert_produces_warning(
            FutureWarning, check_stacklevel=False):
        result = self.read_csv(StringIO(data), dtype={
            'one': 'u1', 1: 'S1'}, as_recarray=True)
        self.assertEqual(result['one'].dtype, 'u1')
        self.assertEqual(result['two'].dtype, 'S1')
def test_floating_array_disallows_float16(request):
    # GH#44715
    arr = np.array([1, 2], dtype=np.float16)
    mask = np.array([False, False])

    msg = "FloatingArray does not support np.float16 dtype"
    with pytest.raises(TypeError, match=msg):
        FloatingArray(arr, mask)

    if np_version_under1p19 or (locale.getlocale()[0] != "en_US"
                                and not is_platform_windows()):
        # the locale condition may need to be refined; this fails on
        # the CI in the ZH_CN build
        mark = pytest.mark.xfail(
            reason="numpy does not raise on np.dtype('Float16')")
        request.node.add_marker(mark)

    # "Float16" (the masked-dtype spelling) must also be rejected by pd.array.
    with pytest.raises(TypeError, match="data type 'Float16' not understood"):
        pd.array([1.0, 2.0], dtype="Float16")
def test_numpy_array_equal_object_message(self):
    """assert_numpy_array_equal reports a readable diff for object arrays."""
    if is_platform_windows():
        raise nose.SkipTest("windows has incomparable line-endings "
                            "and uses L on the shape")

    a = np.array([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-01')])
    b = np.array([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')])

    # Regex-escaped expected failure message (50% of the values differ).
    expected = """numpy array are different

numpy array values are different \\(50\\.0 %\\)
\\[left\\]:  \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\]
\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(a, b)
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(a, b)
def test_close_file_handle_on_invalid_usecols(all_parsers):
    # GH 45384
    parser = all_parsers
    error = ValueError
    if parser.engine == "pyarrow":
        pyarrow = pytest.importorskip("pyarrow")
        # pyarrow surfaces a missing usecols column as its own error type.
        error = pyarrow.lib.ArrowKeyError
        if is_ci_environment() and (is_platform_windows() or is_platform_mac()):
            # GH#45547 causes timeouts on windows/mac builds
            pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22")

    with tm.ensure_clean("test.csv") as fname:
        Path(fname).write_text("col1,col2\na,b\n1,2")
        with tm.assert_produces_warning(False):
            with pytest.raises(error, match="col3"):
                parser.read_csv(fname, usecols=["col1", "col2", "col3"])

        # unlink fails on windows if file handles still point to it
        os.unlink(fname)
def test_constructor_bad_file(self):
    """MMapWrapper rejects non-mmapable objects and closed files."""
    # A StringIO with a fake fileno cannot be memory-mapped.
    non_file = StringIO('I am not a file')
    non_file.fileno = lambda: -1

    # the error raised is different on Windows
    if is_platform_windows():
        msg = "The parameter is incorrect"
        err = OSError
    else:
        # NOTE(review): used as a regex, "[Errno 22]" is a character class,
        # not a literal match -- it still matches the message, but loosely.
        msg = "[Errno 22]"
        err = mmap.error

    tm.assertRaisesRegexp(err, msg, common.MMapWrapper, non_file)

    # A real but already-closed file is also rejected.
    target = open(self.mmap_file, 'r')
    target.close()

    msg = "I/O operation on closed file"
    tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)
def test_constructor_compound_dtypes(self):
    # GH 5191
    # compound dtypes should raise not-implementederror

    def f(dtype):
        # Small frame of (datetime, str, int) rows built with the given dtype.
        data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9))
        return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype)

    pytest.raises(NotImplementedError, f,
                  [("A", "datetime64[h]"),
                   ("B", "str"),
                   ("C", "int32")])

    # these work (though results may be unexpected)
    f('int64')
    f('float64')

    # 10822
    # invalid error message on dt inference
    if not compat.is_platform_windows():
        f('M8[ns]')
def test_compact_ints_as_recarray(self):
    """compact_ints downcasts to i1; use_unsigned switches the result to u1."""
    if compat.is_platform_windows() and self.low_memory:
        raise nose.SkipTest(
            "segfaults on win-64, only when all tests are run")

    data = ('0,1,0,0\n'
            '1,1,0,0\n'
            '0,1,0,1')

    result = self.read_csv(StringIO(data), delimiter=',', header=None,
                           compact_ints=True, as_recarray=True)
    # All values fit in a signed byte.
    ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
    self.assertEqual(result.dtype, ex_dtype)

    result = self.read_csv(StringIO(data), delimiter=',', header=None,
                           as_recarray=True, compact_ints=True,
                           use_unsigned=True)
    # With use_unsigned the same values downcast to unsigned bytes.
    ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
    self.assertEqual(result.dtype, ex_dtype)
def test_replace_series(self, how, to_key, from_key):
    """Series.replace across dtype pairs, via dict and Series replacers.

    Parametrized over replacer style (`how`) and source/target dtype keys
    into the `self.rep` fixture mapping.
    """
    if from_key == 'bool' and how == 'series' and compat.PY3:
        # doesn't work in PY3, though ...dict_from_bool works fine
        pytest.skip("doesn't work as in PY3")
    index = pd.Index([3, 4], name='xxx')
    obj = pd.Series(self.rep[from_key], index=index, name='yyy')
    assert obj.dtype == from_key

    if (from_key.startswith('datetime') and to_key.startswith('datetime')):
        # tested below
        return
    elif from_key in ['datetime64[ns, US/Eastern]', 'datetime64[ns, UTC]']:
        # tested below
        return

    if how == 'dict':
        replacer = dict(zip(self.rep[from_key], self.rep[to_key]))
    elif how == 'series':
        replacer = pd.Series(self.rep[to_key], index=self.rep[from_key])
    else:
        raise ValueError

    result = obj.replace(replacer)

    # BUG FIX: ('int64') was a plain string (missing comma), so `in`
    # performed a substring test; use a one-element tuple for membership.
    if ((from_key == 'float64' and to_key in ('int64',)) or
            (from_key == 'complex128' and to_key in ('int64', 'float64'))):

        if compat.is_platform_32bit() or compat.is_platform_windows():
            pytest.skip("32-bit platform buggy: {0} -> {1}".format(
                from_key, to_key))

        # Expected: do not downcast by replacement
        exp = pd.Series(self.rep[to_key], index=index,
                        name='yyy', dtype=from_key)
    else:
        exp = pd.Series(self.rep[to_key], index=index, name='yyy')
        assert exp.dtype == to_key

    tm.assert_series_equal(result, exp)
def test_replace_series(self, how, to_key, from_key):
    """Series.replace across dtype pairs, via dict and Series replacers.

    Parametrized over replacer style (`how`) and source/target dtype keys
    into the `self.rep` fixture mapping.
    """
    if from_key == "bool" and how == "series":
        # doesn't work in PY3, though ...dict_from_bool works fine
        pytest.skip("doesn't work as in PY3")
    index = pd.Index([3, 4], name="xxx")
    obj = pd.Series(self.rep[from_key], index=index, name="yyy")
    assert obj.dtype == from_key

    if from_key.startswith("datetime") and to_key.startswith("datetime"):
        # tested below
        return
    elif from_key in ["datetime64[ns, US/Eastern]", "datetime64[ns, UTC]"]:
        # tested below
        return

    if how == "dict":
        replacer = dict(zip(self.rep[from_key], self.rep[to_key]))
    elif how == "series":
        replacer = pd.Series(self.rep[to_key], index=self.rep[from_key])
    else:
        raise ValueError

    result = obj.replace(replacer)

    # BUG FIX: ("int64") was a plain string (missing comma), so `in`
    # performed a substring test; use a one-element tuple for membership.
    if (from_key == "float64" and to_key in ("int64",)) or (
            from_key == "complex128" and to_key in ("int64", "float64")):

        if compat.is_platform_32bit() or compat.is_platform_windows():
            pytest.skip("32-bit platform buggy: {0} -> {1}".format(
                from_key, to_key))

        # Expected: do not downcast by replacement
        exp = pd.Series(self.rep[to_key], index=index,
                        name="yyy", dtype=from_key)
    else:
        exp = pd.Series(self.rep[to_key], index=index, name="yyy")
        assert exp.dtype == to_key

    tm.assert_series_equal(result, exp)
def test_itertuples(self):
    """DataFrame.itertuples: values, field names, and namedtuple fallback."""
    # Each tuple is (index, *row values) matching the positional row.
    for i, tup in enumerate(self.frame.itertuples()):
        s = self.klass._constructor_sliced(tup[1:])
        s.name = tup[0]
        expected = self.frame.iloc[i, :].reset_index(drop=True)
        self._assert_series_equal(s, expected)

    df = self.klass({'floats': np.random.randn(5),
                     'ints': lrange(5)}, columns=['floats', 'ints'])

    for tup in df.itertuples(index=False):
        assert isinstance(tup[1], (int, long))

    df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
    # Duplicate column labels must both be emitted.
    dfaa = df[['a', 'a']]

    assert (list(dfaa.itertuples()) ==
            [(0, 1, 1), (1, 2, 2), (2, 3, 3)])

    # repr with be int/long on 32-bit/windows
    if not (compat.is_platform_windows() or compat.is_platform_32bit()):
        assert (repr(list(df.itertuples(name=None))) ==
                '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

    tup = next(df.itertuples(name='TestName'))

    # BUG FIX: comparing the raw `sys.version` string against a LooseVersion
    # relies on implicit coercion and breaks on Python 3 (str/int comparison
    # inside LooseVersion); wrap sys.version in LooseVersion explicitly.
    if LooseVersion(sys.version) >= LooseVersion('2.7'):
        assert tup._fields == ('Index', 'a', 'b')
        assert (tup.Index, tup.a, tup.b) == tup
        assert type(tup).__name__ == 'TestName'

    # Invalid identifiers are replaced by positional field names (_1, _2).
    df.columns = ['def', 'return']
    tup2 = next(df.itertuples(name='TestName'))
    assert tup2 == (0, 1, 4)

    if LooseVersion(sys.version) >= LooseVersion('2.7'):
        assert tup2._fields == ('Index', '_1', '_2')

    df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})

    # will raise SyntaxError if trying to create namedtuple
    tup3 = next(df3.itertuples())
    assert not hasattr(tup3, '_fields')
    assert isinstance(tup3, tuple)
def test_constructor_bad_file(self):
    """MMapWrapper rejects non-mmapable objects and closed files."""
    # A StringIO with a fake fileno cannot be memory-mapped.
    non_file = StringIO('I am not a file')
    non_file.fileno = lambda: -1

    # the error raised is different on Windows
    if is_platform_windows():
        msg = "The parameter is incorrect"
        err = OSError
    else:
        msg = "Invalid argument"
        err = mmap.error

    tm.assertRaisesRegexp(err, msg, common.MMapWrapper, non_file)

    # A real but already-closed file is also rejected.
    target = open(self.mmap_file, 'r')
    target.close()

    msg = "I/O operation on closed file"
    tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)
def test_numpy_array_equal_object_message(self):
    """assert_numpy_array_equal reports a readable diff for object arrays."""
    if is_platform_windows():
        pytest.skip("windows has incomparable line-endings "
                    "and uses L on the shape")

    a = np.array([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-01')])
    b = np.array([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')])

    # Regex-escaped expected failure message (50% of the values differ).
    expected = """numpy array are different

numpy array values are different \\(50\\.0 %\\)
\\[left\\]:  \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\]
\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]"""

    with tm.assert_raises_regex(AssertionError, expected):
        assert_numpy_array_equal(a, b)
    with tm.assert_raises_regex(AssertionError, expected):
        assert_almost_equal(a, b)
def test_replace_series(self, how, to_key, from_key):
    """Series.replace across dtype pairs, via dict and Series replacers.

    Parametrized over replacer style (`how`) and source/target dtype keys
    into the `self.rep` fixture mapping.
    """
    if from_key == 'bool' and how == 'series' and compat.PY3:
        # doesn't work in PY3, though ...dict_from_bool works fine
        pytest.skip("doesn't work as in PY3")
    index = pd.Index([3, 4], name='xxx')
    obj = pd.Series(self.rep[from_key], index=index, name='yyy')
    assert obj.dtype == from_key

    if (from_key.startswith('datetime') and to_key.startswith('datetime')):
        # tested below
        return
    elif from_key in ['datetime64[ns, US/Eastern]', 'datetime64[ns, UTC]']:
        # tested below
        return

    if how == 'dict':
        replacer = dict(zip(self.rep[from_key], self.rep[to_key]))
    elif how == 'series':
        replacer = pd.Series(self.rep[to_key], index=self.rep[from_key])
    else:
        raise ValueError

    result = obj.replace(replacer)

    # BUG FIX: ('int64') was a plain string (missing comma), so `in`
    # performed a substring test; use a one-element tuple for membership.
    if ((from_key == 'float64' and to_key in ('int64',)) or
            (from_key == 'complex128' and to_key in ('int64', 'float64'))):

        if compat.is_platform_32bit() or compat.is_platform_windows():
            pytest.skip("32-bit platform buggy: {0} -> {1}".format(
                from_key, to_key))

        # Expected: do not downcast by replacement
        exp = pd.Series(self.rep[to_key], index=index,
                        name='yyy', dtype=from_key)
    else:
        exp = pd.Series(self.rep[to_key], index=index, name='yyy')
        assert exp.dtype == to_key

    tm.assert_series_equal(result, exp)
def test_itertuples(self, float_frame):
    """DataFrame.itertuples: values, field names, and namedtuple fallback."""
    # Each tuple is (index, *row values) matching the positional row.
    for i, tup in enumerate(float_frame.itertuples()):
        s = self.klass._constructor_sliced(tup[1:])
        s.name = tup[0]
        expected = float_frame.iloc[i, :].reset_index(drop=True)
        self._assert_series_equal(s, expected)

    df = self.klass({
        "floats": np.random.randn(5),
        "ints": range(5)
    }, columns=["floats", "ints"])

    for tup in df.itertuples(index=False):
        assert isinstance(tup[1], int)

    df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
    # Duplicate column labels must both be emitted.
    dfaa = df[["a", "a"]]

    assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]

    # repr with int on 32-bit/windows
    if not (compat.is_platform_windows() or compat.is_platform_32bit()):
        assert (repr(list(df.itertuples(
            name=None))) == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]")

    tup = next(df.itertuples(name="TestName"))
    assert tup._fields == ("Index", "a", "b")
    assert (tup.Index, tup.a, tup.b) == tup
    assert type(tup).__name__ == "TestName"

    # Invalid identifiers are replaced by positional field names (_1, _2).
    df.columns = ["def", "return"]
    tup2 = next(df.itertuples(name="TestName"))
    assert tup2 == (0, 1, 4)
    assert tup2._fields == ("Index", "_1", "_2")

    df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})

    # will raise SyntaxError if trying to create namedtuple
    tup3 = next(df3.itertuples())
    assert not hasattr(tup3, "_fields")
    assert isinstance(tup3, tuple)
def test_constructor_bad_file(self, mmap_file):
    """MMapWrapper rejects non-mmapable objects and closed files."""
    # A StringIO with a fake fileno cannot be memory-mapped.
    non_file = StringIO('I am not a file')
    non_file.fileno = lambda: -1

    # the error raised is different on Windows
    if is_platform_windows():
        msg = "The parameter is incorrect"
        err = OSError
    else:
        # NOTE(review): used as a regex, "[Errno 22]" is a character class,
        # not a literal match -- it still matches the message, but loosely.
        msg = "[Errno 22]"
        err = mmap.error

    with pytest.raises(err, match=msg):
        icom.MMapWrapper(non_file)

    # A real but already-closed file is also rejected.
    target = open(mmap_file, 'r')
    target.close()

    msg = "I/O operation on closed file"
    with pytest.raises(ValueError, match=msg):
        icom.MMapWrapper(target)
def test_numpy_string_dtype_as_recarray(self):
    """An S4 dtype truncates longer strings when reading into a recarray."""
    data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""
    if compat.is_platform_windows():
        raise nose.SkipTest("segfaults on win-64, only when all tests are run")

    def _make_reader(**kwds):
        return TextReader(StringIO(data), delimiter=',', header=None,
                          **kwds)

    reader = _make_reader(dtype='S4', as_recarray=True)
    result = reader.read()
    self.assertEqual(result['0'].dtype, 'S4')
    # 'aaaaa' is truncated to 4 bytes by the S4 dtype.
    ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
    self.assertTrue((result['0'] == ex_values).all())
    self.assertEqual(result['1'].dtype, 'S4')
def test_constructor_bad_file(self, mmap_file):
    """_MMapWrapper rejects non-mmapable objects and closed files."""
    # A StringIO with a fake fileno cannot be memory-mapped.
    non_file = StringIO("I am not a file")
    non_file.fileno = lambda: -1

    # the error raised is different on Windows
    if is_platform_windows():
        msg = "The parameter is incorrect"
        err = OSError
    else:
        # NOTE(review): used as a regex, "[Errno 22]" is a character class,
        # not a literal match -- it still matches the message, but loosely.
        msg = "[Errno 22]"
        err = mmap.error

    with pytest.raises(err, match=msg):
        icom._MMapWrapper(non_file)

    # A real but already-closed file is also rejected.
    target = open(mmap_file)
    target.close()

    msg = "I/O operation on closed file"
    with pytest.raises(ValueError, match=msg):
        icom._MMapWrapper(target)
def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture,
                                                  tz_aware_fixture2):
    """maybe_promote keeps the dtype when tz matches, else upcasts to object."""
    dtype = DatetimeTZDtype(tz=tz_aware_fixture)
    fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2)

    from dateutil.tz import tzlocal

    if is_platform_windows() and tz_aware_fixture2 == tzlocal():
        pytest.xfail("Cannot process fill_value with this dtype, see GH 24310")

    # create array of given dtype; casts "1" to correct dtype
    fill_value = pd.Series([10**9], dtype=fill_dtype)[0]

    # filling datetimetz with datetimetz casts to object, unless tz matches
    exp_val_for_scalar = fill_value
    if dtype.tz == fill_dtype.tz:
        expected_dtype = dtype
    else:
        expected_dtype = np.dtype(object)
        pytest.xfail("fails to cast to object")

    _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
def test_replace_series(self, how, to_key, from_key):
    """Series.replace across dtype pairs, via dict and Series replacers.

    Parametrized over replacer style (`how`) and source/target dtype keys
    into the `self.rep` fixture mapping.
    """
    index = pd.Index([3, 4], name="xxx")
    obj = pd.Series(self.rep[from_key], index=index, name="yyy")
    assert obj.dtype == from_key

    if from_key.startswith("datetime") and to_key.startswith("datetime"):
        # tested below
        return
    elif from_key in ["datetime64[ns, US/Eastern]", "datetime64[ns, UTC]"]:
        # tested below
        return

    if how == "dict":
        replacer = dict(zip(self.rep[from_key], self.rep[to_key]))
    elif how == "series":
        replacer = pd.Series(self.rep[to_key], index=self.rep[from_key])
    else:
        raise ValueError

    result = obj.replace(replacer)

    # BUG FIX: ("int64") was a plain string (missing comma), so `in`
    # performed a substring test; use a one-element tuple for membership.
    if (from_key == "float64" and to_key in ("int64",)) or (
            from_key == "complex128" and to_key in ("int64", "float64")):

        if not IS64 or is_platform_windows():
            pytest.skip(f"32-bit platform buggy: {from_key} -> {to_key}")

        # Expected: do not downcast by replacement
        exp = pd.Series(self.rep[to_key], index=index,
                        name="yyy", dtype=from_key)
    else:
        exp = pd.Series(self.rep[to_key], index=index, name="yyy")
        assert exp.dtype == to_key

    tm.assert_series_equal(result, exp)
def test_encode(self, f):
    """read_html honors the encoding for string, file-like, and filename inputs."""
    # Fixture filenames look like <lang>_<encoding>.html; recover the encoding.
    _, encoding = os.path.splitext(os.path.basename(f))[0].split('_')

    try:
        # 1) raw bytes
        with open(f, 'rb') as fobj:
            from_string = self.read_html(fobj.read(), encoding=encoding,
                                         index_col=0).pop()

        # 2) file-like object wrapping the same bytes
        with open(f, 'rb') as fobj:
            from_file_like = self.read_html(BytesIO(fobj.read()),
                                            encoding=encoding,
                                            index_col=0).pop()

        # 3) plain filename
        from_filename = self.read_html(f, encoding=encoding,
                                       index_col=0).pop()
        # All three input styles must decode to the same frame.
        tm.assert_frame_equal(from_string, from_file_like)
        tm.assert_frame_equal(from_string, from_filename)
    except Exception:
        # seems utf-16/32 fail on windows
        if is_platform_windows():
            if '16' in encoding or '32' in encoding:
                pytest.skip()
        raise
def test_compact_ints_as_recarray(self):
    """compact_ints downcasts to i1; use_unsigned switches the result to u1."""
    if compat.is_platform_windows():
        raise nose.SkipTest(
            "segfaults on win-64, only when all tests are run")

    data = ('0,1,0,0\n'
            '1,1,0,0\n'
            '0,1,0,1')

    # Deprecated options, so each read should emit a FutureWarning.
    with tm.assert_produces_warning(
            FutureWarning, check_stacklevel=False):
        result = self.read_csv(StringIO(data), delimiter=',',
                               header=None, compact_ints=True,
                               as_recarray=True)
        # All values fit in a signed byte.
        ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
        self.assertEqual(result.dtype, ex_dtype)

    with tm.assert_produces_warning(
            FutureWarning, check_stacklevel=False):
        result = self.read_csv(StringIO(data), delimiter=',',
                               header=None, as_recarray=True,
                               compact_ints=True, use_unsigned=True)
        # With use_unsigned the same values downcast to unsigned bytes.
        ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
        self.assertEqual(result.dtype, ex_dtype)
def test_constructor_compound_dtypes(self):
    # GH 5191
    # compound dtypes should raise not-implementederror

    def f(dtype):
        # Small frame of (datetime, str, int) rows built with the given dtype.
        data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9))
        return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype)

    msg = "compound dtypes are not implemented in the DataFrame constructor"
    with pytest.raises(NotImplementedError, match=msg):
        f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])

    # these work (though results may be unexpected)
    # Whole-frame dtype casting is deprecated, hence the FutureWarning.
    depr_msg = "either all columns will be cast to that dtype, or a TypeError will"
    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
        f("int64")
    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
        f("float64")

    # 10822
    # invalid error message on dt inference
    if not compat.is_platform_windows():
        f("M8[ns]")
def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture,
                                                  tz_aware_fixture2, box):
    """maybe_promote keeps the dtype when tz matches, else upcasts to object."""
    dtype = DatetimeTZDtype(tz=tz_aware_fixture)
    fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2)
    boxed, box_dtype = box  # read from parametrized fixture

    from dateutil.tz import tzlocal

    if is_platform_windows() and tz_aware_fixture2 == tzlocal():
        pytest.xfail("Cannot process fill_value with this dtype, see GH 24310")
    if dtype.tz == fill_dtype.tz and boxed:
        pytest.xfail("falsely upcasts")
    if dtype.tz != fill_dtype.tz and not boxed:
        pytest.xfail("falsely upcasts")

    # create array of given dtype; casts "1" to correct dtype
    fill_value = pd.Series([10**9], dtype=fill_dtype)[0]

    # filling datetimetz with datetimetz casts to object, unless tz matches
    exp_val_for_scalar = fill_value
    if dtype.tz == fill_dtype.tz:
        expected_dtype = dtype
        exp_val_for_array = NaT
    else:
        expected_dtype = np.dtype(object)
        exp_val_for_array = np.nan

    _check_promote(
        dtype,
        fill_value,
        boxed,
        box_dtype,
        expected_dtype,
        exp_val_for_scalar,
        exp_val_for_array,
    )
def test_numpy_string_dtype_as_recarray(self):
    """An S4 dtype truncates longer strings when reading into a recarray."""
    data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""
    if compat.is_platform_windows():
        raise nose.SkipTest(
            "segfaults on win-64, only when all tests are run")

    def _make_reader(**kwds):
        return TextReader(StringIO(data), delimiter=',', header=None,
                          **kwds)

    reader = _make_reader(dtype='S4', as_recarray=True)
    result = reader.read()
    self.assertEqual(result['0'].dtype, 'S4')
    # 'aaaaa' is truncated to 4 bytes by the S4 dtype.
    ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
    self.assertTrue((result['0'] == ex_values).all())
    self.assertEqual(result['1'].dtype, 'S4')
def test_intersect():
    """BlockIndex/IntIndex.intersect: correct results, and errors on length mismatch."""

    def _check_correct(a, b, expected):
        # The intersection must compare equal to the expected index.
        result = a.intersect(b)
        assert (result.equals(expected))

    def _check_length_exc(a, longer):
        # Intersecting indexes with different total lengths must raise.
        nose.tools.assert_raises(Exception, a.intersect, longer)

    def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
        xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
        yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
        expected = BlockIndex(TEST_LENGTH, eloc, elen)
        # Same block layout but a longer total length -> must not intersect.
        longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen)

        _check_correct(xindex, yindex, expected)
        # Repeat the checks after converting block indexes to int indexes.
        _check_correct(xindex.to_int_index(), yindex.to_int_index(),
                       expected.to_int_index())

        _check_length_exc(xindex, longer_index)
        _check_length_exc(xindex.to_int_index(),
                          longer_index.to_int_index())

    if compat.is_platform_windows():
        raise nose.SkipTest("segfaults on win-64 when all tests are run")
    check_cases(_check_case)
def _skip_if_no_mpl():
    """Return True (-> skip) when matplotlib is missing.

    When matplotlib is present, force the non-interactive Agg backend as a
    side effect so plotting tests can run headless.
    """
    mod = safe_import("matplotlib")
    if mod:
        mod.use("Agg", warn=False)
    else:
        return True


def _skip_if_mpl_1_5():
    """Return True (-> skip) for matplotlib versions newer than 1.4.3.

    Otherwise force the Agg backend as a side effect.
    """
    mod = safe_import("matplotlib")

    if mod:
        v = mod.__version__
        # str(v)[0] == '0' guards against pre-1.0 dev version strings.
        if LooseVersion(v) > LooseVersion('1.4.3') or str(v)[0] == '0':
            return True
        else:
            mod.use("Agg", warn=False)


# Reusable pytest skip markers built from the helpers/platform checks above.
skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(),
                                    reason="Missing matplotlib dependency")
skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(),
                                     reason="matplotlib 1.5")
skip_if_32bit = pytest.mark.skipif(is_platform_32bit(),
                                   reason="skipping for 32 bit")
skip_if_windows = pytest.mark.skipif(is_platform_windows(),
                                     reason="Running on Windows")
skip_if_windows_python_3 = pytest.mark.skipif(is_platform_windows() and PY3,
                                              reason=("not used on python3/"
                                                      "win32"))
import locale
import os

import pytest

from pandas._config.localization import can_set_locale, get_locales, set_locale

from pandas.compat import is_platform_windows

import pandas as pd

# All locales available on this machine (empty list when none can be found).
_all_locales = get_locales() or []
# Remember the process locale so tests can restore it.
_current_locale = locale.getlocale()

# Don't run any of these tests if we are on Windows or have no locales.
pytestmark = pytest.mark.skipif(is_platform_windows() or not _all_locales,
                                reason="Need non-Windows and locales")

_skip_if_only_one_locale = pytest.mark.skipif(
    len(_all_locales) <= 1,
    reason="Need multiple locales for meaningful test")


def test_can_set_locale_valid_set():
    # Can set the default locale.
    assert can_set_locale("")


def test_can_set_locale_invalid_set():
    # Cannot set an invalid locale.
    assert not can_set_locale("non-existent_locale")
def pa():
    """Fixture value: the 'pyarrow' parquet engine, skipped when unusable."""
    if not _HAVE_PYARROW:
        pytest.skip("pyarrow is not installed")
    if is_platform_windows():
        pytest.skip("pyarrow-parquet not building on windows")
    return 'pyarrow'
result = frequencies.infer_freq(vals) assert result == rng.inferred_freq @pytest.mark.parametrize("idx", [ tm.makeIntIndex(10), tm.makeFloatIndex(10), tm.makePeriodIndex(10) ]) def test_invalid_index_types(idx): msg = ("(cannot infer freq from a non-convertible)|" "(Check the `freq` attribute instead of using infer_freq)") with pytest.raises(TypeError, match=msg): frequencies.infer_freq(idx) @pytest.mark.skipif(is_platform_windows(), reason="see gh-10822: Windows issue") @pytest.mark.parametrize("idx", [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]) def test_invalid_index_types_unicode(idx): # see gh-10822 # # Odd error message on conversions to datetime for unicode. msg = "Unknown string format" with pytest.raises(ValueError, match=msg): frequencies.infer_freq(idx) def test_string_datetime_like_compat(): # see gh-6463
# -*- coding: utf-8 -*- from __future__ import print_function import numpy as np import pandas as pd import pandas.util.testing as tm from pandas.compat import (is_platform_windows, is_platform_32bit) from pandas.core.config import option_context use_32bit_repr = is_platform_windows() or is_platform_32bit() class TestSparseSeriesFormatting(tm.TestCase): @property def dtype_format_for_platform(self): return '' if use_32bit_repr else ', dtype=int32' def test_sparse_max_row(self): s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() result = repr(s) dfm = self.dtype_format_for_platform exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" "4 NaN\ndtype: float64\nBlockIndex\n" "Block locations: array([0, 3]{0})\n" "Block lengths: array([1, 1]{0})".format(dfm)) self.assertEqual(result, exp)
from pandas.compat import is_platform_windows, lrange
from pandas.compat.numpy import np_array_datetime64_compat

import pandas as pd
from pandas import DataFrame, DatetimeIndex, Index, MultiIndex
from pandas.core.indexes.datetimes import date_range
import pandas.util.testing as tm

import pandas.io.date_converters as conv
import pandas.io.parsers as parsers

# constant
_DEFAULT_DATETIME = datetime(1, 1, 1)

# Strategy for hypothesis
if is_platform_windows():
    # NOTE(review): presumably restricted because Windows' C strftime cannot
    # handle pre-1900 dates -- confirm against the hypothesis/strftime docs.
    date_strategy = st.datetimes(min_value=datetime(1900, 1, 1))
else:
    date_strategy = st.datetimes()


def test_separator_date_conflict(all_parsers):
    # Regression test for gh-4678
    #
    # Make sure thousands separator and
    # date parsing do not conflict.
    parser = all_parsers
    data = "06-02-2013;13:00;1-000.215"
    expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
                         columns=["Date", 2])
    # NOTE(review): the body appears truncated at this chunk boundary --
    # the read_csv call and assertion presumably follow.
from pandas._libs.tslibs.parsing import parse_datetime_string

from pandas.compat import is_platform_windows
from pandas.compat.numpy import np_array_datetime64_compat

import pandas as pd
from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series
import pandas._testing as tm
from pandas.core.indexes.datetimes import date_range

import pandas.io.date_converters as conv

# constant
_DEFAULT_DATETIME = datetime(1, 1, 1)

# Strategy for hypothesis
# NOTE(review): pre-1900 datetimes are excluded on Windows only — presumably
# a platform limitation with very old dates; confirm before changing.
if is_platform_windows():
    date_strategy = st.datetimes(min_value=datetime(1900, 1, 1))
else:
    date_strategy = st.datetimes()


def test_separator_date_conflict(all_parsers):
    # Regression test for gh-4678
    #
    # Make sure thousands separator and
    # date parsing do not conflict.
    parser = all_parsers
    data = "06-02-2013;13:00;1-000.215"
    expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
                         columns=["Date", 2])
@pytest.mark.parametrize(
    "idx", [tm.makeIntIndex(10), tm.makeFloatIndex(10),
            tm.makePeriodIndex(10)])
def test_invalid_index_types(idx):
    # Numeric and Period indexes carry no inferable datetime frequency;
    # infer_freq must raise TypeError matching one of these two messages.
    msg = ("(cannot infer freq from a non-convertible)|"
           "(Check the `freq` attribute instead of using infer_freq)")

    with pytest.raises(TypeError, match=msg):
        frequencies.infer_freq(idx)


@pytest.mark.skipif(is_platform_windows(),
                    reason="see gh-10822: Windows issue")
@pytest.mark.parametrize(
    "idx", [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)])
def test_invalid_index_types_unicode(idx):
    # see gh-10822
    #
    # Odd error message on conversions to datetime for unicode.
    # String indexes fail datetime conversion with ValueError rather than
    # the TypeError raised for numeric/Period indexes above.
    msg = "Unknown string format"

    with pytest.raises(ValueError, match=msg):
        frequencies.infer_freq(idx)


def test_string_datetime_like_compat():
        # --- tail of an engine-selection test that starts before this view:
        # explicit engine names must always resolve to their implementation,
        # regardless of the currently configured option.
        assert isinstance(get_engine('auto'), PyArrowImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)

    # With the option set to 'fastparquet', 'auto' follows the option.
    with pd.option_context('io.parquet.engine', 'fastparquet'):
        assert isinstance(get_engine('auto'), FastParquetImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)

    # With the option left at 'auto', pyarrow is preferred.
    with pd.option_context('io.parquet.engine', 'auto'):
        assert isinstance(get_engine('auto'), PyArrowImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)


@pytest.mark.xfail(is_platform_windows() or is_platform_mac(),
                   reason="reading pa metadata failing on Windows/mac",
                   strict=True)
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines:
    # write with pyarrow, read back (fully and column-subset) with fastparquet.
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=pa, compression=None)

        result = read_parquet(path, engine=fp)
        tm.assert_frame_equal(result, df)

        result = read_parquet(path, engine=fp, columns=['a', 'd'])
        tm.assert_frame_equal(result, df[['a', 'd']])
        # --- tail of an engine-selection test that starts before this view:
        # explicit engine names must always resolve to their implementation,
        # regardless of the currently configured option.
        assert isinstance(get_engine('auto'), PyArrowImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)

    # With the option set to 'fastparquet', 'auto' follows the option.
    with pd.option_context('io.parquet.engine', 'fastparquet'):
        assert isinstance(get_engine('auto'), FastParquetImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)

    # With the option left at 'auto', pyarrow is preferred.
    with pd.option_context('io.parquet.engine', 'auto'):
        assert isinstance(get_engine('auto'), PyArrowImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)


@pytest.mark.xfail(is_platform_windows() or is_platform_mac(),
                   reason="reading pa metadata failing on Windows/mac")
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines:
    # write with pyarrow, read back (fully and column-subset) with fastparquet.
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=pa, compression=None)

        result = read_parquet(path, engine=fp)
        tm.assert_frame_equal(result, df)

        result = read_parquet(path, engine=fp, columns=['a', 'd'])
        tm.assert_frame_equal(result, df[['a', 'd']])
def test_numpy_array_equal_message(self):
    # Verifies the exact failure messages produced by
    # assert_numpy_array_equal / assert_almost_equal; each `expected`
    # is a regex-escaped pattern for assertRaisesRegexp.
    if is_platform_windows():
        raise nose.SkipTest("windows has incomparable line-endings "
                            "and uses L on the shape")

    # shape mismatch
    expected = """numpy array are different

numpy array shapes are different
\\[left\\]:  \\(2,\\)
\\[right\\]: \\(3,\\)"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]))

    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]))

    # scalar comparison
    expected = """Expected type """
    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(1, 2)
    expected = """expected 2\\.00000 but got 1\\.00000, with decimal 5"""
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(1, 2)

    # array / scalar array comparison
    expected = """numpy array are different

numpy array classes are different
\\[left\\]:  ndarray
\\[right\\]: int"""

    with assertRaisesRegexp(AssertionError, expected):
        # numpy_array_equal only accepts np.ndarray
        assert_numpy_array_equal(np.array([1]), 1)
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(np.array([1]), 1)

    # scalar / array comparison
    expected = """numpy array are different

numpy array classes are different
\\[left\\]:  int
\\[right\\]: ndarray"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(1, np.array([1]))
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(1, np.array([1]))

    # values differ, NaNs treated as unequal positions
    expected = """numpy array are different

numpy array values are different \\(66\\.66667 %\\)
\\[left\\]:  \\[nan, 2\\.0, 3\\.0\\]
\\[right\\]: \\[1\\.0, nan, 3\\.0\\]"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(np.array([np.nan, 2, 3]),
                                 np.array([1, np.nan, 3]))
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(np.array([np.nan, 2, 3]),
                            np.array([1, np.nan, 3]))

    # integer values differ
    expected = """numpy array are different

numpy array values are different \\(50\\.0 %\\)
\\[left\\]:  \\[1, 2\\]
\\[right\\]: \\[1, 3\\]"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(np.array([1, 2]), np.array([1, 3]))
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(np.array([1, 2]), np.array([1, 3]))

    # nearly-equal floats: exact equality fails, almost_equal passes
    expected = """numpy array are different

numpy array values are different \\(50\\.0 %\\)
\\[left\\]:  \\[1\\.1, 2\\.000001\\]
\\[right\\]: \\[1\\.1, 2.0\\]"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(
            np.array([1.1, 2.000001]), np.array([1.1, 2.0]))

    # must pass
    assert_almost_equal(np.array([1.1, 2.000001]), np.array([1.1, 2.0]))

    # 2-D arrays: mismatch percentage counts all elements (1 of 6)
    expected = """numpy array are different

numpy array values are different \\(16\\.66667 %\\)
\\[left\\]:  \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\]
\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]),
                                 np.array([[1, 3], [3, 4], [5, 6]]))
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(np.array([[1, 2], [3, 4], [5, 6]]),
                            np.array([[1, 3], [3, 4], [5, 6]]))

    # 2-D arrays: 1 of 4 elements differs
    expected = """numpy array are different

numpy array values are different \\(25\\.0 %\\)
\\[left\\]:  \\[\\[1, 2\\], \\[3, 4\\]\\]
\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(np.array([[1, 2], [3, 4]]),
                                 np.array([[1, 3], [3, 4]]))
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(np.array([[1, 2], [3, 4]]),
                            np.array([[1, 3], [3, 4]]))

    # allow to overwrite message
    expected = """Index are different

Index shapes are different
\\[left\\]:  \\(2,\\)
\\[right\\]: \\(3,\\)"""

    with assertRaisesRegexp(AssertionError, expected):
        assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]),
                                 obj='Index')
    with assertRaisesRegexp(AssertionError, expected):
        assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]),
                            obj='Index')
            # --- tail of a skip-decorator factory (def starts before this
            # view): extend the skip reason, then apply skipif to `func`.
            msg += " satisfying a min_version of {}".format(min_version)
        return pytest.mark.skipif(
            not safe_import(package, min_version=min_version), reason=msg
        )(func)
    return decorated_func


# Reusable skip/xfail marks; all conditions are evaluated once, at import
# time of this module.
skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(),
                                    reason="Missing matplotlib dependency")
skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(),
                                     reason="matplotlib 1.5")
xfail_if_mpl_2_2 = pytest.mark.xfail(_skip_if_mpl_2_2(),
                                     reason="matplotlib 2.2")
skip_if_32bit = pytest.mark.skipif(is_platform_32bit(),
                                   reason="skipping for 32 bit")
skip_if_windows = pytest.mark.skipif(is_platform_windows(),
                                     reason="Running on Windows")
skip_if_windows_python_3 = pytest.mark.skipif(is_platform_windows() and PY3,
                                              reason=("not used on python3/"
                                                      "win32"))
skip_if_has_locale = pytest.mark.skipif(_skip_if_has_locale(),
                                        reason="Specific locale is set {lang}"
                                        .format(lang=locale.getlocale()[0]))
skip_if_not_us_locale = pytest.mark.skipif(_skip_if_not_us_locale(),
                                           reason="Specific locale is set "
                                           "{lang}".format(
                                               lang=locale.getlocale()[0]))
skip_if_no_scipy = pytest.mark.skipif(_skip_if_no_scipy(),
                                      reason="Missing SciPy requirement")
skip_if_no_lzma = pytest.mark.skipif(_skip_if_no_lzma(),
                                     reason="need backports.lzma to run")
parametrization mark. """ msg = f"Could not import '{package}'" if min_version: msg += f" satisfying a min_version of {min_version}" return pytest.mark.skipif( not safe_import(package, min_version=min_version), reason=msg ) skip_if_no_mpl = pytest.mark.skipif( _skip_if_no_mpl(), reason="Missing matplotlib dependency" ) skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") skip_if_32bit = pytest.mark.skipif(not IS64, reason="skipping for 32 bit") skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows") skip_if_not_us_locale = pytest.mark.skipif( _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}" ) skip_if_no_scipy = pytest.mark.skipif( _skip_if_no_scipy(), reason="Missing SciPy requirement" ) skip_if_no_ne = pytest.mark.skipif( not USE_NUMEXPR, reason=f"numexpr enabled->{USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}", ) # TODO(pytest#7469): return type, _pytest.mark.structures.MarkDecorator is not public # https://github.com/pytest-dev/pytest/issues/7469 def skip_if_np_lt(ver_str: str, *args, reason: str | None = None):
class TestParquetPyArrow(Base):
    # Round-trip and error-path tests for the pyarrow parquet engine
    # (class continues past this view).

    def test_basic(self, pa, df_full):
        df = df_full

        # additional supported types for pyarrow
        dti = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
        dti = dti._with_freq(None)  # freq doesn't round-trip
        df["datetime_tz"] = dti
        df["bool_with_none"] = [True, None, True]

        check_round_trip(df, pa)

    def test_basic_subset_columns(self, pa, df_full):
        # GH18628
        df = df_full
        # additional supported types for pyarrow
        df["datetime_tz"] = pd.date_range("20130101", periods=3,
                                          tz="Europe/Brussels")

        # reading back only a column subset must match the sliced frame
        check_round_trip(
            df,
            pa,
            expected=df[["string", "int"]],
            read_kwargs={"columns": ["string", "int"]},
        )

    def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
        # GH 37105
        # with no path/buffer, to_parquet returns the serialized bytes
        buf_bytes = df_full.to_parquet(engine=pa)
        assert isinstance(buf_bytes, bytes)

        buf_stream = BytesIO(buf_bytes)
        res = read_parquet(buf_stream)

        tm.assert_frame_equal(df_full, res)

    def test_duplicate_columns(self, pa):
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=list("aaa")).copy()
        self.check_error_on_write(df, pa, ValueError,
                                  "Duplicate column names found")

    def test_unsupported(self, pa):
        # timedelta
        df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)})
        self.check_external_error_on_write(df, pa, NotImplementedError)

        # mixed python objects
        df = pd.DataFrame({"a": ["a", 1, 2.0]})
        # pyarrow 0.11 raises ArrowTypeError
        # older pyarrows raise ArrowInvalid
        self.check_external_error_on_write(df, pa, pyarrow.ArrowException)

    def test_categorical(self, pa):
        # supported in >= 0.7.0
        df = pd.DataFrame()
        df["a"] = pd.Categorical(list("abcdef"))

        # test for null, out-of-order values, and unobserved category
        df["b"] = pd.Categorical(
            ["bar", "foo", "foo", "bar", None, "bar"],
            dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
        )

        # test for ordered flag
        df["c"] = pd.Categorical(["a", "b", "c", "a", "c", "b"],
                                 categories=["b", "c", "d"], ordered=True)

        check_round_trip(df, pa)

    @pytest.mark.xfail(
        is_platform_windows(),
reason="localhost connection rejected", strict=False, ) def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): s3fs = pytest.importorskip("s3fs") s3 = s3fs.S3FileSystem(**s3so) kw = {"filesystem": s3} check_round_trip( df_compat, pa, path="pandas-test/pyarrow.parquet", read_kwargs=kw, write_kwargs=kw, ) def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so): # GH #19134 s3so = {"storage_options": s3so} check_round_trip( df_compat, pa, path="s3://pandas-test/pyarrow.parquet", read_kwargs=s3so, write_kwargs=s3so, ) @td.skip_if_no("s3fs") # also requires flask @pytest.mark.parametrize( "partition_col", [ ["A"], [], ], ) def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col, s3so): # GH #26388 expected_df = df_compat.copy() # GH #35791 # read_table uses the new Arrow Datasets API since pyarrow 1.0.0 # Previous behaviour was pyarrow partitioned columns become 'category' dtypes # These are added to back of dataframe on read. In new API category dtype is # only used if partition field is string, but this changed again to use # category dtype for all types (not only strings) in pyarrow 2.0.0 if partition_col: partition_col_type = ("int32" if (not pa_version_under1p0) and pa_version_under2p0 else "category") expected_df[partition_col] = expected_df[partition_col].astype( partition_col_type) check_round_trip( df_compat, pa, expected=expected_df, path="s3://pandas-test/parquet_dir", read_kwargs={"storage_options": s3so}, write_kwargs={ "partition_cols": partition_col, "compression": None, "storage_options": s3so, }, check_like=True, repeat=1, ) @td.skip_if_no("pyarrow") def test_read_file_like_obj_support(self, df_compat): buffer = BytesIO() df_compat.to_parquet(buffer) df_from_buf = read_parquet(buffer) tm.assert_frame_equal(df_compat, df_from_buf) @td.skip_if_no("pyarrow") def test_expand_user(self, df_compat, monkeypatch): monkeypatch.setenv("HOME", "TestingUser") monkeypatch.setenv("USERPROFILE", "TestingUser") with 
        pytest.raises(OSError, match=r".*TestingUser.*"):
            # --- continuation of a `with` statement cut at the chunk
            # boundary: the expanded path must appear in the error message.
            read_parquet("~/file.parquet")
        with pytest.raises(OSError, match=r".*TestingUser.*"):
            df_compat.to_parquet("~/file.parquet")

    def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        # writing with a list of partition columns creates a partitioned
        # dataset directory whose read-back shape matches the original.
        partition_cols = ["bool", "int"]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols,
                          compression=None)
            check_partition_names(path, partition_cols)
            assert read_parquet(path).shape == df.shape

    def test_partition_cols_string(self, pa, df_full):
        # GH #27117
        # a single string partition column is accepted as well as a list
        partition_cols = "bool"
        partition_cols_list = [partition_cols]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols,
                          compression=None)
            check_partition_names(path, partition_cols_list)
            assert read_parquet(path).shape == df.shape

    @pytest.mark.parametrize("path_type", [str, pathlib.Path])
    def test_partition_cols_pathlib(self, pa, df_compat, path_type):
        # GH 35902
        # both str and pathlib.Path destinations are supported
        partition_cols = "B"
        partition_cols_list = [partition_cols]
        df = df_compat

        with tm.ensure_clean_dir() as path_str:
            path = path_type(path_str)
            df.to_parquet(path, partition_cols=partition_cols_list)
            assert read_parquet(path).shape == df.shape

    def test_empty_dataframe(self, pa):
        # GH #27339
        df = pd.DataFrame()
        check_round_trip(df, pa)

    def test_write_with_schema(self, pa):
        import pyarrow

        # an explicit pyarrow schema overrides the pandas dtypes on write
        df = pd.DataFrame({"x": [0, 1]})
        schema = pyarrow.schema([pyarrow.field("x", type=pyarrow.bool_())])
        out_df = df.astype(bool)
        check_round_trip(df, pa, write_kwargs={"schema": schema},
                         expected=out_df)

    @td.skip_if_no("pyarrow")
    def test_additional_extension_arrays(self, pa):
        # test additional ExtensionArrays that are supported through the
        # __arrow_array__ protocol
        df = pd.DataFrame({
            "a": pd.Series([1, 2, 3], dtype="Int64"),
            "b": pd.Series([1, 2, 3], dtype="UInt32"),
            "c": pd.Series(["a", None, "c"], dtype="string"),
        })
        check_round_trip(df, pa)

        df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
        check_round_trip(df, pa)
@td.skip_if_no("pyarrow", min_version="1.0.0") def test_pyarrow_backed_string_array(self, pa, string_storage): # test ArrowStringArray supported through the __arrow_array__ protocol df = pd.DataFrame( {"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) @td.skip_if_no("pyarrow") def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol + by defining a custom ExtensionType df = pd.DataFrame({ # Arrow does not yet support struct in writing to Parquet (ARROW-1644) # "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]), "d": pd.period_range("2012-01-01", periods=3, freq="D"), }) check_round_trip(df, pa) def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so # this should work without error df = pd.DataFrame( {"a": pd.date_range("2017-01-01", freq="1n", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": "2.0"}) def test_timezone_aware_index(self, pa, timezone_aware_date_list): if not pa_version_under2p0: # temporary skip this test until it is properly resolved # https://github.com/pandas-dev/pandas/issues/37286 pytest.skip() idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) # see gh-36004 # compare time(zone) values only, skip their class: # pyarrow always creates fixed offset timezones using pytz.FixedOffset() # even if it was datetime.timezone() originally # # technically they are the same: # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute check_round_trip(df, pa, check_dtype=False) @td.skip_if_no("pyarrow", min_version="1.0.0") def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 df = pd.DataFrame({"a": list(range(0, 
                                           3))})
        # --- continuation of test_filter_row_groups cut at the chunk
        # boundary: predicate-pushdown filters must restrict the result.
        with tm.ensure_clean() as path:
            df.to_parquet(path, pa)
            result = read_parquet(path, pa, filters=[("a", "==", 0)],
                                  use_legacy_dataset=False)
        assert len(result) == 1

    def test_read_parquet_manager(self, pa, using_array_manager):
        # ensure that read_parquet honors the pandas.options.mode.data_manager
        # option: the returned frame's internal manager matches the setting.
        df = pd.DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])

        with tm.ensure_clean() as path:
            df.to_parquet(path, pa)
            result = read_parquet(path, pa)

        if using_array_manager:
            assert isinstance(result._mgr, pd.core.internals.ArrayManager)
        else:
            assert isinstance(result._mgr, pd.core.internals.BlockManager)
import codecs
import locale
import os

import pytest

from pandas._config.localization import can_set_locale, get_locales, set_locale

from pandas.compat import is_platform_windows

_all_locales = get_locales() or []
_current_locale = locale.getlocale()

# Don't run any of these tests if we are on Windows or have no locales.
pytestmark = pytest.mark.skipif(
    is_platform_windows() or not _all_locales,
    reason="Need non-Windows and locales",
)

_skip_if_only_one_locale = pytest.mark.skipif(
    len(_all_locales) <= 1,
    reason="Need multiple locales for meaningful test",
)


def test_can_set_locale_valid_set():
    """Setting the default ("" = user environment) locale must succeed."""
    default_locale = ""
    assert can_set_locale(default_locale)


def test_can_set_locale_invalid_set():
    """A bogus locale name must be reported as not settable."""
    bogus_locale = "non-existent_locale"
    assert not can_set_locale(bogus_locale)