def test_deprecated_match(self):
    # Old match behavior, deprecated (but still default) in 0.13
    values = Series(["fooBAD__barBAD", NA, "foo"])

    with tm.assert_produces_warning():
        result = values.str.match(".*(BAD[_]+).*(BAD)")
    exp = Series([("BAD__", "BAD"), NA, []])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(),
                    "foo", None, 1, 2.0])
    with tm.assert_produces_warning():
        rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)")
    xp = [("BAD_", "BAD"), NA, ("BAD_", "BAD"), NA, NA, [], NA, NA, NA]
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u("fooBAD__barBAD"), NA, u("foo")])

    with tm.assert_produces_warning():
        result = values.str.match(".*(BAD[_]+).*(BAD)")
    exp = Series([(u("BAD__"), u("BAD")), NA, []])
    tm.assert_series_equal(result, exp)
def test_replace(self):
    values = Series(["fooBAD__barBAD", NA])

    result = values.str.replace("BAD[_]*", "")
    exp = Series(["foobar", NA])
    tm.assert_series_equal(result, exp)

    result = values.str.replace("BAD[_]*", "", n=1)
    exp = Series(["foobarBAD", NA])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(["aBAD", NA, "bBAD", True, datetime.today(), "fooBAD",
                    None, 1, 2.0])
    rs = Series(mixed).str.replace("BAD[_]*", "")
    xp = ["a", NA, "b", NA, NA, "foo", NA, NA, NA]
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u("fooBAD__barBAD"), NA])

    result = values.str.replace("BAD[_]*", "")
    exp = Series([u("foobar"), NA])
    tm.assert_series_equal(result, exp)

    result = values.str.replace("BAD[_]*", "", n=1)
    exp = Series([u("foobarBAD"), NA])
    tm.assert_series_equal(result, exp)

    # flags + unicode
    values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
    exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
    result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
    tm.assert_series_equal(result, exp)
def test_split(self):
    values = Series(["a_b_c", "c_d_e", NA, "f_g_h"])

    result = values.str.split("_")
    exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]])
    tm.assert_series_equal(result, exp)

    # more than one char
    values = Series(["a__b__c", "c__d__e", NA, "f__g__h"])
    result = values.str.split("__")
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(["a_b_c", NA, "d_e_f", True, datetime.today(), None, 1,
                    2.0])
    rs = Series(mixed).str.split("_")
    xp = Series([["a", "b", "c"], NA, ["d", "e", "f"], NA, NA, NA, NA, NA])
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u("a_b_c"), u("c_d_e"), NA, u("f_g_h")])
    result = values.str.split("_")
    exp = Series([[u("a"), u("b"), u("c")], [u("c"), u("d"), u("e")], NA,
                  [u("f"), u("g"), u("h")]])
    tm.assert_series_equal(result, exp)
def test_endswith(self):
    values = Series(["om", NA, "foo_nom", "nom", "bar_foo", NA, "foo"])

    result = values.str.endswith("foo")
    exp = Series([False, NA, False, False, True, NA, True])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]
    rs = strings.str_endswith(mixed, "f")
    xp = [False, NA, False, NA, NA, False, NA, NA, NA]
    tm.assert_almost_equal(rs, xp)

    rs = Series(mixed).str.endswith("f")
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u("om"), NA, u("foo_nom"), u("nom"), u("bar_foo"), NA,
                     u("foo")])

    result = values.str.endswith("foo")
    exp = Series([False, NA, False, False, True, NA, True])
    tm.assert_series_equal(result, exp)

    result = values.str.endswith("foo", na=False)
    tm.assert_series_equal(result, exp.fillna(False).astype(bool))
def test_count(self):
    values = ['foo', 'foofoo', NA, 'foooofooofommmfoo']

    result = strings.str_count(values, 'f[o]+')
    exp = [1, 2, NA, 4]
    tm.assert_almost_equal(result, exp)

    result = Series(values).str.count('f[o]+')
    tm.assert_isinstance(result, Series)
    tm.assert_almost_equal(result, exp)

    # mixed
    mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
    rs = strings.str_count(mixed, 'a')
    xp = [1, NA, 0, NA, NA, 0, NA, NA, NA]
    tm.assert_almost_equal(rs, xp)

    rs = Series(mixed).str.count('a')
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = [u('foo'), u('foofoo'), NA, u('foooofooofommmfoo')]

    result = strings.str_count(values, 'f[o]+')
    exp = [1, 2, NA, 4]
    tm.assert_almost_equal(result, exp)

    result = Series(values).str.count('f[o]+')
    tm.assert_isinstance(result, Series)
    tm.assert_almost_equal(result, exp)
def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
    """
    internal. pprinter for iterables. you should probably use pprint_thing()
    rather than calling this directly.
    """
    fmt = u("{{{things}}}")
    pairs = []

    pfmt = u("{key}: {val}")

    if max_seq_items is False:
        nitems = len(seq)
    else:
        nitems = max_seq_items or get_option("max_seq_items") or len(seq)

    for k, v in list(seq.items())[:nitems]:
        pairs.append(
            pfmt.format(
                key=pprint_thing(k, _nest_lvl + 1,
                                 max_seq_items=max_seq_items, **kwds),
                val=pprint_thing(v, _nest_lvl + 1,
                                 max_seq_items=max_seq_items, **kwds)))

    if nitems < len(seq):
        return fmt.format(things=", ".join(pairs) + ", ...")
    else:
        return fmt.format(things=", ".join(pairs))
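# Illustrative usage sketch (not part of the original module): given pandas'
# "max_seq_items" option machinery, _pprint_dict truncates long mappings and
# appends an ellipsis, e.g.
#
#   >>> _pprint_dict({i: i for i in range(100)}, max_seq_items=2)
#   u'{0: 0, 1: 1, ...}'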
def test_match(self):
    # New match behavior introduced in 0.13
    values = Series(['fooBAD__barBAD', NA, 'foo'])
    with tm.assert_produces_warning():
        result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
    exp = Series([True, NA, False])
    tm.assert_series_equal(result, exp)

    # If no groups, use new behavior even when as_indexer is False.
    # (Old behavior is pretty much useless in this case.)
    values = Series(['fooBAD__barBAD', NA, 'foo'])
    result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
    exp = Series([True, NA, False])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
                    'foo', None, 1, 2.])
    with tm.assert_produces_warning():
        rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
    xp = [True, NA, True, NA, NA, False, NA, NA, NA]
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u('fooBAD__barBAD'), NA, u('foo')])
    with tm.assert_produces_warning():
        result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
    exp = Series([True, NA, False])
    tm.assert_series_equal(result, exp)
def test_astype_unicode(self):
    # GH7758
    # a bit of magic is required to set the default encoding to utf-8
    digits = string.digits
    test_series = [
        Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
        Series([u('データーサイエンス、お前はもう死んでいる')]),
    ]

    former_encoding = None
    if not compat.PY3:
        # in Python 2 we can force the default encoding for this test
        former_encoding = sys.getdefaultencoding()
        reload(sys)  # noqa
        sys.setdefaultencoding("utf-8")
    if sys.getdefaultencoding() == "utf-8":
        test_series.append(Series([u('野菜食べないとやばい').encode("utf-8")]))

    for s in test_series:
        res = s.astype("unicode")
        expec = s.map(compat.text_type)
        assert_series_equal(res, expec)

    # restore the former encoding
    if former_encoding is not None and former_encoding != "utf-8":
        reload(sys)  # noqa
        sys.setdefaultencoding(former_encoding)
def testArrayNumpyLabelled(self):
    input = {'a': []}
    output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
    self.assertTrue((np.empty((1, 0)) == output[0]).all())
    self.assertTrue((np.array(['a']) == output[1]).all())
    self.assertTrue(output[2] is None)

    input = [{'a': 42}]
    output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
    self.assertTrue((np.array([42]) == output[0]).all())
    self.assertTrue(output[1] is None)
    self.assertTrue((np.array([u('a')]) == output[2]).all())

    # py3 is non-deterministic on the ordering......
    if not compat.PY3:
        input = [{'a': 42, 'b': 31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}]
        output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
        expectedvals = np.array(
            [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2))
        self.assertTrue((expectedvals == output[0]).all())
        self.assertTrue(output[1] is None)
        self.assertTrue((np.array([u('a'), 'b']) == output[2]).all())

        input = {1: {'a': 42, 'b': 31}, 2: {'a': 24, 'c': 99},
                 3: {'a': 2.4, 'b': 78}}
        output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
        expectedvals = np.array(
            [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2))
        self.assertTrue((expectedvals == output[0]).all())
        self.assertTrue((np.array(['1', '2', '3']) == output[1]).all())
        self.assertTrue((np.array(['a', 'b']) == output[2]).all())
def load_reduce(self):
    stack = self.stack
    args = stack.pop()
    func = stack[-1]
    if type(args[0]) is type:
        n = args[0].__name__
        if n == u('DeprecatedSeries') or n == u('DeprecatedTimeSeries'):
            stack[-1] = object.__new__(Series)
            return
        elif (n == u('DeprecatedSparseSeries') or
              n == u('DeprecatedSparseTimeSeries')):
            stack[-1] = object.__new__(SparseSeries)
            return

    try:
        value = func(*args)
    except:
        # try to reencode the arguments
        if self.encoding is not None:
            args = tuple([arg.encode(self.encoding)
                          if isinstance(arg, string_types)
                          else arg for arg in args])
            try:
                stack[-1] = func(*args)
                return
            except:
                pass

        if self.is_verbose:
            print(sys.exc_info())
            print(func, args)
        raise

    stack[-1] = value
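# Illustrative note (not part of the original module): load_reduce is a
# pickle-compat hook. When an old pickle references a class that has been
# renamed (mapped to the Deprecated* names above), the REDUCE step is
# intercepted and a bare Series/SparseSeries instance is substituted, to be
# filled in by the subsequent pickle state; otherwise the reduce call is
# retried with re-encoded string arguments before giving up.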
def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
    """
    internal. pprinter for iterables. you should probably use pprint_thing()
    rather than calling this directly.

    bounds length of printed sequence, depending on options
    """
    if isinstance(seq, set):
        fmt = u("{{{body}}}")
    else:
        fmt = u("[{body}]") if hasattr(seq, '__setitem__') else u("({body})")

    if max_seq_items is False:
        nitems = len(seq)
    else:
        nitems = max_seq_items or get_option("max_seq_items") or len(seq)

    s = iter(seq)
    r = []
    for i in range(min(nitems, len(seq))):  # handle sets, no slicing
        r.append(pprint_thing(
            next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds))
    body = ", ".join(r)

    if nitems < len(seq):
        body += ", ..."
    elif isinstance(seq, tuple) and len(seq) == 1:
        body += ','

    return fmt.format(body=body)
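# Illustrative usage sketch (not part of the original module): the bracket
# style is chosen from the container type, and single-element tuples keep
# their trailing comma, e.g.
#
#   >>> _pprint_seq([1, 2, 3])   # -> u'[1, 2, 3]'
#   >>> _pprint_seq((1,))        # -> u'(1,)'
#   >>> _pprint_seq({1, 2, 3})   # -> u'{1, 2, 3}'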
def test_to_latex_escape(self):
    a = 'a'
    b = 'b'

    test_dict = {u('co^l1'): {a: "a", b: "b"},
                 u('co$e^x$'): {a: "a", b: "b"}}

    unescaped_result = DataFrame(test_dict).to_latex(escape=False)
    escaped_result = DataFrame(test_dict).to_latex()  # default: escape=True

    unescaped_expected = r'''\begin{tabular}{lll}
\toprule
{} & co$e^x$ & co^l1 \\
\midrule
a & a & a \\
b & b & b \\
\bottomrule
\end{tabular}
'''

    escaped_expected = r'''\begin{tabular}{lll}
\toprule
{} & co\$e\textasciicircumx\$ & co\textasciicircuml1 \\
\midrule
a & a & a \\
b & b & b \\
\bottomrule
\end{tabular}
'''

    assert unescaped_result == unescaped_expected
    assert escaped_result == escaped_expected
def test_array_numpy_labelled(self):
    labelled_input = {"a": []}
    output = ujson.loads(ujson.dumps(labelled_input),
                         numpy=True, labelled=True)
    assert (np.empty((1, 0)) == output[0]).all()
    assert (np.array(["a"]) == output[1]).all()
    assert output[2] is None

    labelled_input = [{"a": 42}]
    output = ujson.loads(ujson.dumps(labelled_input),
                         numpy=True, labelled=True)
    assert (np.array([u("a")]) == output[2]).all()
    assert (np.array([42]) == output[0]).all()
    assert output[1] is None

    # see gh-10837: write out the dump explicitly
    # so there is no dependency on iteration order
    input_dumps = ('[{"a": 42, "b":31}, {"a": 24, "c": 99}, '
                   '{"a": 2.4, "b": 78}]')
    output = ujson.loads(input_dumps, numpy=True, labelled=True)
    expected_vals = np.array(
        [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2))
    assert (expected_vals == output[0]).all()
    assert output[1] is None
    assert (np.array([u("a"), "b"]) == output[2]).all()

    input_dumps = ('{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, '
                   '"3": {"a": 2.4, "b": 78}}')
    output = ujson.loads(input_dumps, numpy=True, labelled=True)
    expected_vals = np.array(
        [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2))
    assert (expected_vals == output[0]).all()
    assert (np.array(["1", "2", "3"]) == output[1]).all()
    assert (np.array(["a", "b"]) == output[2]).all()
def test_to_html_unicode(self, datapath):
    df = DataFrame({u('\u03c3'): np.arange(10.)})
    expected = expected_html(datapath, 'unicode_1')
    assert df.to_html() == expected

    df = DataFrame({'A': [u('\u03c3')]})
    expected = expected_html(datapath, 'unicode_2')
    assert df.to_html() == expected
def test_to_html_unicode(self):
    df = DataFrame({u('\u03c3'): np.arange(10.)})
    expected = u'<table border="1" class="dataframe">\n <thead>\n <tr style="text-align: right;">\n <th></th>\n <th>\u03c3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4.0</td>\n </tr>\n <tr>\n <th>5</th>\n <td>5.0</td>\n </tr>\n <tr>\n <th>6</th>\n <td>6.0</td>\n </tr>\n <tr>\n <th>7</th>\n <td>7.0</td>\n </tr>\n <tr>\n <th>8</th>\n <td>8.0</td>\n </tr>\n <tr>\n <th>9</th>\n <td>9.0</td>\n </tr>\n </tbody>\n</table>'  # noqa
    assert df.to_html() == expected

    df = DataFrame({'A': [u('\u03c3')]})
    expected = u'<table border="1" class="dataframe">\n <thead>\n <tr style="text-align: right;">\n <th></th>\n <th>A</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>\u03c3</td>\n </tr>\n </tbody>\n</table>'  # noqa
    assert df.to_html() == expected
def _build_option_description(k):
    """ Builds a formatted description of a registered option and returns it """

    o = _get_registered_option(k)
    d = _get_deprecated_option(k)

    s = u('{k} ').format(k=k)

    if o.doc:
        s += '\n'.join(o.doc.strip().split('\n'))
    else:
        s += 'No description available.'

    if o:
        s += (u('\n [default: {default}] [currently: {current}]')
              .format(default=o.defval, current=_get_option(k, True)))

    if d:
        s += u('\n (Deprecated')
        s += (u(', use `{rkey}` instead.')
              .format(rkey=d.rkey if d.rkey else ''))
        s += u(')')

    s += '\n\n'
    return s
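# Illustrative only: for a registered option the returned description is
# roughly
#
#   <option.name> <option docstring>
#   [default: <defval>] [currently: <current value>]
#
# with a "(Deprecated, use `<rkey>` instead.)" suffix appended for
# deprecated options.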
def axis_pretty(a):
    v = getattr(self, a)
    if len(v) > 0:
        return u('%s axis: %s to %s') % (a.capitalize(),
                                         com.pprint_thing(v[0]),
                                         com.pprint_thing(v[-1]))
    else:
        return u('%s axis: None') % a.capitalize()
def test_is_recompilable():
    passes = (r"a", u("x"), r"asdf", re.compile("adsf"), u(r"\u2233\s*"),
              re.compile(r""))
    fails = 1, [], object()

    for p in passes:
        assert com.is_re_compilable(p)

    for f in fails:
        assert not com.is_re_compilable(f)
def test_to_csv_unicode_index(self):
    buf = StringIO()
    s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")])

    s.to_csv(buf, encoding="UTF-8")
    buf.seek(0)

    s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
    assert_series_equal(s, s2)
def test_read_csv(self):
    if not compat.PY3:
        if compat.is_platform_windows():
            prefix = u("file:///")
        else:
            prefix = u("file://")

        fname = prefix + compat.text_type(self.csv1)
        self.read_csv(fname, index_col=0, parse_dates=True)
def _format_native_types(self, na_rep=u('NaT'), **kwargs):
    values = np.array(list(self), dtype=object)
    mask = isnull(self.values)
    values[mask] = na_rep

    imask = ~mask
    values[imask] = np.array([u('%s') % dt for dt in values[imask]])
    return values
def test_wdi_download(self):
    raise nose.SkipTest

    expected = {
        'GDPPCKN': {
            (u('United States'), u('2003')): u('40800.0735367688'),
            (u('Canada'), u('2004')): u('37857.1261134552'),
            (u('United States'), u('2005')): u('42714.8594790102'),
            (u('Canada'), u('2003')): u('37081.4575704003'),
            (u('United States'), u('2004')): u('41826.1728310667'),
            (u('Mexico'), u('2003')): u('72720.0691255285'),
            (u('Mexico'), u('2004')): u('74751.6003347038'),
            (u('Mexico'), u('2005')): u('76200.2154469437'),
            (u('Canada'), u('2005')): u('38617.4563629611')},
        'GDPPCKD': {
            (u('United States'), u('2003')): u('40800.0735367688'),
            (u('Canada'), u('2004')): u('34397.055116118'),
            (u('United States'), u('2005')): u('42714.8594790102'),
            (u('Canada'), u('2003')): u('33692.2812368928'),
            (u('United States'), u('2004')): u('41826.1728310667'),
            (u('Mexico'), u('2003')): u('7608.43848670658'),
            (u('Mexico'), u('2004')): u('7820.99026814334'),
            (u('Mexico'), u('2005')): u('7972.55364129367'),
            (u('Canada'), u('2005')): u('35087.8925933298')}}
    expected = pandas.DataFrame(expected)
    result = download(country=['CA', 'MX', 'US', 'junk'],
                      indicator=['GDPPCKD', 'GDPPCKN', 'junk'],
                      start=2003, end=2005)
    expected.index = result.index
    assert_frame_equal(result, pandas.DataFrame(expected))
def test_encode_decode(self):
    base = Series([u('a'), u('b'), u('a\xe4')])
    series = base.str.encode('utf-8')

    f = lambda x: x.decode('utf-8')
    result = series.str.decode('utf-8')
    exp = series.map(f)

    tm.assert_series_equal(result, exp)
def test_repr_should_return_str(self):
    # https://docs.python.org/3/reference/datamodel.html#object.__repr__
    # ...The return value must be a string object.
    # (str on py2.x, str (unicode) on py3)

    data = [8, 5, 3, 5]
    index1 = [u("\u03c3"), u("\u03c4"), u("\u03c5"), u("\u03c6")]
    df = Series(data, index=index1)
    assert type(df.__repr__()) == str  # both py2 / 3
def setup_method(self, _):
    self.lst = [3, 5, 7, -2]
    self.klass = FrozenNDArray

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("ignore", FutureWarning)

        self.container = FrozenNDArray(self.lst)
        self.unicode_container = FrozenNDArray(
            [u("\u05d0"), u("\u05d1"), "c"])
def test_filter(self):
    # Items
    filtered = self.frame.filter(['A', 'B', 'E'])
    assert len(filtered.columns) == 2
    assert 'E' not in filtered

    filtered = self.frame.filter(['A', 'B', 'E'], axis='columns')
    assert len(filtered.columns) == 2
    assert 'E' not in filtered

    # Other axis
    idx = self.frame.index[0:4]
    filtered = self.frame.filter(idx, axis='index')
    expected = self.frame.reindex(index=idx)
    tm.assert_frame_equal(filtered, expected)

    # like
    fcopy = self.frame.copy()
    fcopy['AA'] = 1

    filtered = fcopy.filter(like='A')
    assert len(filtered.columns) == 2
    assert 'AA' in filtered

    # like with ints in column names
    df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B'])
    filtered = df.filter(like='_')
    assert len(filtered.columns) == 2

    # regex with ints in column names
    # from PR #10384
    df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C'])
    expected = DataFrame(
        0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object))
    filtered = df.filter(regex='^[0-9]+$')
    tm.assert_frame_equal(filtered, expected)

    expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1'])
    # shouldn't remove anything
    filtered = expected.filter(regex='^[0-9]+$')
    tm.assert_frame_equal(filtered, expected)

    # pass in None
    with tm.assert_raises_regex(TypeError, 'Must pass'):
        self.frame.filter()
    with tm.assert_raises_regex(TypeError, 'Must pass'):
        self.frame.filter(items=None)
    with tm.assert_raises_regex(TypeError, 'Must pass'):
        self.frame.filter(axis=1)

    # test mutually exclusive arguments
    with tm.assert_raises_regex(TypeError, 'mutually exclusive'):
        self.frame.filter(items=['one', 'three'], regex='e$', like='bbi')
    with tm.assert_raises_regex(TypeError, 'mutually exclusive'):
        self.frame.filter(items=['one', 'three'], regex='e$', axis=1)
    with tm.assert_raises_regex(TypeError, 'mutually exclusive'):
        self.frame.filter(items=['one', 'three'], regex='e$')
    with tm.assert_raises_regex(TypeError, 'mutually exclusive'):
        self.frame.filter(items=['one', 'three'], like='bbi', axis=0)
    with tm.assert_raises_regex(TypeError, 'mutually exclusive'):
        self.frame.filter(items=['one', 'three'], like='bbi')

    # objects
    filtered = self.mixed_frame.filter(like='foo')
    assert 'foo' in filtered

    # unicode columns, won't ascii-encode
    df = self.frame.rename(columns={'B': u('\u2202')})
    filtered = df.filter(like='C')
    assert 'C' in filtered
def _u(x):
    return {u(k): _u(x[k]) for k in x} if isinstance(x, dict) else x
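# Illustrative only: _u recursively coerces dict keys to unicode while
# leaving non-dict values untouched, e.g.
#
#   >>> _u({'a': {'b': 1}})
#   {u'a': {u'b': 1}}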
class TestUltraJSONTests(object):

    @pytest.mark.skipif(compat.is_platform_32bit(),
                        reason="not compliant on 32-bit, xref #15865")
    def test_encode_decimal(self):
        sut = decimal.Decimal("1337.1337")
        encoded = ujson.encode(sut, double_precision=15)
        decoded = ujson.decode(encoded)
        assert decoded == 1337.1337

        sut = decimal.Decimal("0.95")
        encoded = ujson.encode(sut, double_precision=1)
        assert encoded == "1.0"

        decoded = ujson.decode(encoded)
        assert decoded == 1.0

        sut = decimal.Decimal("0.94")
        encoded = ujson.encode(sut, double_precision=1)
        assert encoded == "0.9"

        decoded = ujson.decode(encoded)
        assert decoded == 0.9

        sut = decimal.Decimal("1.95")
        encoded = ujson.encode(sut, double_precision=1)
        assert encoded == "2.0"

        decoded = ujson.decode(encoded)
        assert decoded == 2.0

        sut = decimal.Decimal("-1.95")
        encoded = ujson.encode(sut, double_precision=1)
        assert encoded == "-2.0"

        decoded = ujson.decode(encoded)
        assert decoded == -2.0

        sut = decimal.Decimal("0.995")
        encoded = ujson.encode(sut, double_precision=2)
        assert encoded == "1.0"

        decoded = ujson.decode(encoded)
        assert decoded == 1.0

        sut = decimal.Decimal("0.9995")
        encoded = ujson.encode(sut, double_precision=3)
        assert encoded == "1.0"

        decoded = ujson.decode(encoded)
        assert decoded == 1.0

        sut = decimal.Decimal("0.99999999999999944")
        encoded = ujson.encode(sut, double_precision=15)
        assert encoded == "1.0"

        decoded = ujson.decode(encoded)
        assert decoded == 1.0

    @pytest.mark.parametrize("ensure_ascii", [True, False])
    def test_encode_string_conversion(self, ensure_ascii):
        string_input = "A string \\ / \b \f \n \r \t </script> &"
        not_html_encoded = ('"A string \\\\ \\/ \\b \\f \\n '
                            '\\r \\t <\\/script> &"')
        html_encoded = ('"A string \\\\ \\/ \\b \\f \\n \\r \\t '
                        '\\u003c\\/script\\u003e \\u0026"')

        def helper(expected_output, **encode_kwargs):
            output = ujson.encode(string_input,
                                  ensure_ascii=ensure_ascii,
                                  **encode_kwargs)

            assert output == expected_output
            assert string_input == json.loads(output)
            assert string_input == ujson.decode(output)

        # Default behavior assumes encode_html_chars=False.
        helper(not_html_encoded)

        # Make sure explicit encode_html_chars=False works.
        helper(not_html_encoded, encode_html_chars=False)

        # Make sure explicit encode_html_chars=True does the encoding.
        helper(html_encoded, encode_html_chars=True)

    @pytest.mark.parametrize("long_number", [
        -4342969734183514, -12345678901234.56789012, -528656961.4399388
    ])
    def test_double_long_numbers(self, long_number):
        sut = {u("a"): long_number}
        encoded = ujson.encode(sut, double_precision=15)

        decoded = ujson.decode(encoded)
        assert sut == decoded

    def test_encode_non_c_locale(self):
        lc_category = locale.LC_NUMERIC

        # We just need one of these locales to work.
        for new_locale in ("it_IT.UTF-8", "Italian_Italy"):
            if tm.can_set_locale(new_locale, lc_category):
                with tm.set_locale(new_locale, lc_category):
                    assert ujson.loads(ujson.dumps(4.78e60)) == 4.78e60
                    assert ujson.loads("4.78", precise_float=True) == 4.78
                break

    def test_decimal_decode_test_precise(self):
        sut = {u("a"): 4.56}
        encoded = ujson.encode(sut)
        decoded = ujson.decode(encoded, precise_float=True)
        assert sut == decoded

    @pytest.mark.skipif(compat.is_platform_windows() and not compat.PY3,
                        reason="buggy on win-64 for py2")
    def test_encode_double_tiny_exponential(self):
        num = 1e-40
        assert num == ujson.decode(ujson.encode(num))

        num = 1e-100
        assert num == ujson.decode(ujson.encode(num))

        num = -1e-45
        assert num == ujson.decode(ujson.encode(num))

        num = -1e-145
        assert np.allclose(num, ujson.decode(ujson.encode(num)))

    @pytest.mark.parametrize("unicode_key", [u("key1"), u("بن")])
    def test_encode_dict_with_unicode_keys(self, unicode_key):
        unicode_dict = {unicode_key: u("value1")}
        assert unicode_dict == ujson.decode(ujson.encode(unicode_dict))

    @pytest.mark.parametrize("double_input", [
        math.pi,
        -math.pi  # Should work with negatives too.
    ])
    def test_encode_double_conversion(self, double_input):
        output = ujson.encode(double_input)
        assert round(double_input, 5) == round(json.loads(output), 5)
        assert round(double_input, 5) == round(ujson.decode(output), 5)

    def test_encode_with_decimal(self):
        decimal_input = 1.0
        output = ujson.encode(decimal_input)

        assert output == "1.0"

    def test_encode_array_of_nested_arrays(self):
        nested_input = [[[[]]]] * 20
        output = ujson.encode(nested_input)

        assert nested_input == json.loads(output)
        assert nested_input == ujson.decode(output)

        nested_input = np.array(nested_input)
        tm.assert_numpy_array_equal(nested_input, ujson.decode(
            output, numpy=True, dtype=nested_input.dtype))

    def test_encode_array_of_doubles(self):
        doubles_input = [31337.31337, 31337.31337,
                         31337.31337, 31337.31337] * 10
        output = ujson.encode(doubles_input)

        assert doubles_input == json.loads(output)
        assert doubles_input == ujson.decode(output)

        tm.assert_numpy_array_equal(np.array(doubles_input),
                                    ujson.decode(output, numpy=True))

    def test_double_precision(self):
        double_input = 30.012345678901234
        output = ujson.encode(double_input, double_precision=15)

        assert double_input == json.loads(output)
        assert double_input == ujson.decode(output)

        for double_precision in (3, 9):
            output = ujson.encode(double_input,
                                  double_precision=double_precision)
            rounded_input = round(double_input, double_precision)

            assert rounded_input == json.loads(output)
            assert rounded_input == ujson.decode(output)

    @pytest.mark.parametrize("invalid_val", [
        20, -1, "9", None
    ])
    def test_invalid_double_precision(self, invalid_val):
        double_input = 30.12345678901234567890
        expected_exception = (ValueError if isinstance(invalid_val, int)
                              else TypeError)

        with pytest.raises(expected_exception):
            ujson.encode(double_input, double_precision=invalid_val)

    def test_encode_string_conversion2(self):
        string_input = "A string \\ / \b \f \n \r \t"
        output = ujson.encode(string_input)

        assert string_input == json.loads(output)
        assert string_input == ujson.decode(output)
        assert output == '"A string \\\\ \\/ \\b \\f \\n \\r \\t"'

    @pytest.mark.parametrize("unicode_input", [
        "Räksmörgås اسامة بن محمد بن عوض بن لادن",
        "\xe6\x97\xa5\xd1\x88"
    ])
    def test_encode_unicode_conversion(self, unicode_input):
        enc = ujson.encode(unicode_input)
        dec = ujson.decode(enc)

        assert enc == json_unicode(unicode_input)
        assert dec == json.loads(enc)

    def test_encode_control_escaping(self):
        escaped_input = "\x19"
        enc = ujson.encode(escaped_input)
        dec = ujson.decode(enc)

        assert escaped_input == dec
        assert enc == json_unicode(escaped_input)

    def test_encode_unicode_surrogate_pair(self):
        surrogate_input = "\xf0\x90\x8d\x86"
        enc = ujson.encode(surrogate_input)
        dec = ujson.decode(enc)

        assert enc == json_unicode(surrogate_input)
        assert dec == json.loads(enc)

    def test_encode_unicode_4bytes_utf8(self):
        four_bytes_input = "\xf0\x91\x80\xb0TRAILINGNORMAL"
        enc = ujson.encode(four_bytes_input)
        dec = ujson.decode(enc)

        assert enc == json_unicode(four_bytes_input)
        assert dec == json.loads(enc)

    def test_encode_unicode_4bytes_utf8highest(self):
        four_bytes_input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL"
        enc = ujson.encode(four_bytes_input)
        dec = ujson.decode(enc)

        assert enc == json_unicode(four_bytes_input)
        assert dec == json.loads(enc)

    def test_encode_array_in_array(self):
        arr_in_arr_input = [[[[]]]]
        output = ujson.encode(arr_in_arr_input)

        assert arr_in_arr_input == json.loads(output)
        assert output == json.dumps(arr_in_arr_input)
        assert arr_in_arr_input == ujson.decode(output)

        tm.assert_numpy_array_equal(np.array(arr_in_arr_input),
                                    ujson.decode(output, numpy=True))

    @pytest.mark.parametrize("num_input", [
        31337,
        -31337,  # Negative number.
        -9223372036854775808  # Large negative number.
    ])
    def test_encode_num_conversion(self, num_input):
        output = ujson.encode(num_input)
        assert num_input == json.loads(output)
        assert output == json.dumps(num_input)
        assert num_input == ujson.decode(output)

    def test_encode_list_conversion(self):
        list_input = [1, 2, 3, 4]
        output = ujson.encode(list_input)

        assert list_input == json.loads(output)
        assert list_input == ujson.decode(output)

        tm.assert_numpy_array_equal(np.array(list_input),
                                    ujson.decode(output, numpy=True))

    def test_encode_dict_conversion(self):
        dict_input = {"k1": 1, "k2": 2, "k3": 3, "k4": 4}
        output = ujson.encode(dict_input)

        assert dict_input == json.loads(output)
        assert dict_input == ujson.decode(output)

    @pytest.mark.parametrize("builtin_value", [None, True, False])
    def test_encode_builtin_values_conversion(self, builtin_value):
        output = ujson.encode(builtin_value)
        assert builtin_value == json.loads(output)
        assert output == json.dumps(builtin_value)
        assert builtin_value == ujson.decode(output)

    def test_encode_datetime_conversion(self):
        datetime_input = datetime.datetime.fromtimestamp(time.time())
        output = ujson.encode(datetime_input, date_unit="s")
        expected = calendar.timegm(datetime_input.utctimetuple())

        assert int(expected) == json.loads(output)
        assert int(expected) == ujson.decode(output)

    def test_encode_date_conversion(self):
        date_input = datetime.date.fromtimestamp(time.time())
        output = ujson.encode(date_input, date_unit="s")

        tup = (date_input.year, date_input.month, date_input.day, 0, 0, 0)
        expected = calendar.timegm(tup)

        assert int(expected) == json.loads(output)
        assert int(expected) == ujson.decode(output)

    @pytest.mark.parametrize("test", [
        datetime.time(),
        datetime.time(1, 2, 3),
        datetime.time(10, 12, 15, 343243),
    ])
    def test_encode_time_conversion_basic(self, test):
        output = ujson.encode(test)
        expected = '"{iso}"'.format(iso=test.isoformat())
        assert expected == output

    def test_encode_time_conversion_pytz(self):
        # see gh-11473: to_json segfaults with timezone-aware datetimes
        test = datetime.time(10, 12, 15, 343243, pytz.utc)
        output = ujson.encode(test)
        expected = '"{iso}"'.format(iso=test.isoformat())
        assert expected == output

    def test_encode_time_conversion_dateutil(self):
        # see gh-11473: to_json segfaults with timezone-aware datetimes
        test = datetime.time(10, 12, 15, 343243, dateutil.tz.tzutc())
        output = ujson.encode(test)
        expected = '"{iso}"'.format(iso=test.isoformat())
        assert expected == output

    @pytest.mark.parametrize("decoded_input", [
        NaT,
        np.datetime64("NaT"),
        np.nan,
        np.inf,
        -np.inf
    ])
    def test_encode_as_null(self, decoded_input):
        assert ujson.encode(decoded_input) == "null", "Expected null"

    def test_datetime_units(self):
        val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504)
        stamp = Timestamp(val)

        roundtrip = ujson.decode(ujson.encode(val, date_unit='s'))
        assert roundtrip == stamp.value // 10**9

        roundtrip = ujson.decode(ujson.encode(val, date_unit='ms'))
        assert roundtrip == stamp.value // 10**6

        roundtrip = ujson.decode(ujson.encode(val, date_unit='us'))
        assert roundtrip == stamp.value // 10**3

        roundtrip = ujson.decode(ujson.encode(val, date_unit='ns'))
        assert roundtrip == stamp.value

        pytest.raises(ValueError, ujson.encode, val, date_unit='foo')

    def test_encode_to_utf8(self):
        unencoded = "\xe6\x97\xa5\xd1\x88"

        enc = ujson.encode(unencoded, ensure_ascii=False)
        dec = ujson.decode(enc)

        assert enc == json_unicode(unencoded, ensure_ascii=False)
        assert dec == json.loads(enc)

    def test_decode_from_unicode(self):
        unicode_input = u("{\"obj\": 31337}")

        dec1 = ujson.decode(unicode_input)
        dec2 = ujson.decode(str(unicode_input))

        assert dec1 == dec2

    def test_encode_recursion_max(self):
        # 8 is the max recursion depth

        class O2(object):
            member = 0
            pass

        class O1(object):
            member = 0
            pass

        decoded_input = O1()
        decoded_input.member = O2()
        decoded_input.member.member = decoded_input

        with pytest.raises(OverflowError):
            ujson.encode(decoded_input)

    def test_decode_jibberish(self):
        jibberish = "fdsa sda v9sa fdsa"

        with pytest.raises(ValueError):
            ujson.decode(jibberish)

    @pytest.mark.parametrize("broken_json", [
        "[",  # Broken array start.
        "{",  # Broken object start.
        "]",  # Broken array end.
        "}",  # Broken object end.
    ])
    def test_decode_broken_json(self, broken_json):
        with pytest.raises(ValueError):
            ujson.decode(broken_json)

    @pytest.mark.parametrize("too_big_char", [
        "[",
        "{",
    ])
    def test_decode_depth_too_big(self, too_big_char):
        with pytest.raises(ValueError):
            ujson.decode(too_big_char * (1024 * 1024))

    @pytest.mark.parametrize("bad_string", [
        "\"TESTING",  # Unterminated.
        "\"TESTING\\\"",  # Unterminated escape.
        "tru",  # Broken True.
        "fa",  # Broken False.
        "n",  # Broken None.
    ])
    def test_decode_bad_string(self, bad_string):
        with pytest.raises(ValueError):
            ujson.decode(bad_string)

    @pytest.mark.parametrize("broken_json", [
        '{{1337:""}}',
        '{{"key":"}',
        '[[[true',
    ])
    def test_decode_broken_json_leak(self, broken_json):
        for _ in range(1000):
            with pytest.raises(ValueError):
                ujson.decode(broken_json)

    @pytest.mark.parametrize("invalid_dict", [
        "{{{{31337}}}}",  # No key.
        "{{{{\"key\":}}}}",  # No value.
        "{{{{\"key\"}}}}",  # No colon or value.
    ])
    def test_decode_invalid_dict(self, invalid_dict):
        with pytest.raises(ValueError):
            ujson.decode(invalid_dict)

    @pytest.mark.parametrize("numeric_int_as_str", [
        "31337", "-31337"  # Should work with negatives.
    ])
    def test_decode_numeric_int(self, numeric_int_as_str):
        assert int(numeric_int_as_str) == ujson.decode(numeric_int_as_str)

    @pytest.mark.skipif(compat.PY3, reason="only PY2")
    def test_encode_unicode_4bytes_utf8_fail(self):
        with pytest.raises(OverflowError):
            ujson.encode("\xfd\xbf\xbf\xbf\xbf\xbf")

    def test_encode_null_character(self):
        wrapped_input = "31337 \x00 1337"
        output = ujson.encode(wrapped_input)

        assert wrapped_input == json.loads(output)
        assert output == json.dumps(wrapped_input)
        assert wrapped_input == ujson.decode(output)

        alone_input = "\x00"
        output = ujson.encode(alone_input)

        assert alone_input == json.loads(output)
        assert output == json.dumps(alone_input)
        assert alone_input == ujson.decode(output)
        assert '" \\u0000\\r\\n "' == ujson.dumps(u(" \u0000\r\n "))

    def test_decode_null_character(self):
        wrapped_input = "\"31337 \\u0000 31337\""
        assert ujson.decode(wrapped_input) == json.loads(wrapped_input)

    def test_encode_list_long_conversion(self):
        long_input = [9223372036854775807, 9223372036854775807,
                      9223372036854775807, 9223372036854775807,
                      9223372036854775807, 9223372036854775807]
        output = ujson.encode(long_input)

        assert long_input == json.loads(output)
        assert long_input == ujson.decode(output)

        tm.assert_numpy_array_equal(np.array(long_input),
                                    ujson.decode(output, numpy=True,
                                                 dtype=np.int64))

    def test_encode_long_conversion(self):
        long_input = 9223372036854775807
        output = ujson.encode(long_input)

        assert long_input == json.loads(output)
        assert output == json.dumps(long_input)
        assert long_input == ujson.decode(output)

    @pytest.mark.parametrize("int_exp", [
        "1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"
    ])
    def test_decode_numeric_int_exp(self, int_exp):
        assert ujson.decode(int_exp) == json.loads(int_exp)

    def test_dump_to_file(self):
        f = StringIO()
        ujson.dump([1, 2, 3], f)
        assert "[1,2,3]" == f.getvalue()

    def test_dump_to_file_like(self):
        class FileLike(object):

            def __init__(self):
                self.bytes = ''

            def write(self, data_bytes):
                self.bytes += data_bytes

        f = FileLike()
        ujson.dump([1, 2, 3], f)
        assert "[1,2,3]" == f.bytes

    def test_dump_file_args_error(self):
        with pytest.raises(TypeError):
            ujson.dump([], "")

    def test_load_file(self):
        data = "[1,2,3,4]"
        exp_data = [1, 2, 3, 4]

        f = StringIO(data)
        assert exp_data == ujson.load(f)

        f = StringIO(data)
        tm.assert_numpy_array_equal(np.array(exp_data),
                                    ujson.load(f, numpy=True))

    def test_load_file_like(self):
        class FileLike(object):

            def read(self):
                try:
                    self.end
                except AttributeError:
                    self.end = True
                    return "[1,2,3,4]"

        exp_data = [1, 2, 3, 4]

        f = FileLike()
        assert exp_data == ujson.load(f)

        f = FileLike()
        tm.assert_numpy_array_equal(np.array(exp_data),
                                    ujson.load(f, numpy=True))

    def test_load_file_args_error(self):
        with pytest.raises(TypeError):
            ujson.load("[]")

    def test_version(self):
        assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \
            "ujson.__version__ must be a string like '1.4.0'"

    def test_encode_numeric_overflow(self):
        with pytest.raises(OverflowError):
            ujson.encode(12839128391289382193812939)

    def test_encode_numeric_overflow_nested(self):
        class Nested(object):
            x = 12839128391289382193812939

        for _ in range(0, 100):
            with pytest.raises(OverflowError):
                ujson.encode(Nested())

    @pytest.mark.parametrize("val", [
        3590016419, 2**31, 2**32, (2**32) - 1
    ])
    def test_decode_number_with_32bit_sign_bit(self, val):
        # Test that numbers that fit within 32 bits but would have the
        # sign bit set (2**31 <= x < 2**32) are decoded properly.
doc = '{{"id": {val}}}'.format(val=val) assert ujson.decode(doc)["id"] == val def test_encode_big_escape(self): # Make sure no Exception is raised. for _ in range(10): base = '\u00e5'.encode("utf-8") if compat.PY3 else "\xc3\xa5" escape_input = base * 1024 * 1024 * 2 ujson.encode(escape_input) def test_decode_big_escape(self): # Make sure no Exception is raised. for _ in range(10): base = '\u00e5'.encode("utf-8") if compat.PY3 else "\xc3\xa5" quote = compat.str_to_bytes("\"") escape_input = quote + (base * 1024 * 1024 * 2) + quote ujson.decode(escape_input) def test_to_dict(self): d = {u("key"): 31337} class DictTest(object): def toDict(self): return d o = DictTest() output = ujson.encode(o) dec = ujson.decode(output) assert dec == d def test_default_handler(self): class _TestObject(object): def __init__(self, val): self.val = val @property def recursive_attr(self): return _TestObject("recursive_attr") def __str__(self): return str(self.val) pytest.raises(OverflowError, ujson.encode, _TestObject("foo")) assert '"foo"' == ujson.encode(_TestObject("foo"), default_handler=str) def my_handler(_): return "foobar" assert '"foobar"' == ujson.encode(_TestObject("foo"), default_handler=my_handler) def my_handler_raises(_): raise TypeError("I raise for anything") with pytest.raises(TypeError, match="I raise for anything"): ujson.encode(_TestObject("foo"), default_handler=my_handler_raises) def my_int_handler(_): return 42 assert ujson.decode( ujson.encode(_TestObject("foo"), default_handler=my_int_handler)) == 42 def my_obj_handler(_): return datetime.datetime(2013, 2, 3) assert (ujson.decode(ujson.encode(datetime.datetime( 2013, 2, 3))) == ujson.decode( ujson.encode(_TestObject("foo"), default_handler=my_obj_handler))) obj_list = [_TestObject("foo"), _TestObject("bar")] assert (json.loads(json.dumps(obj_list, default=str)) == ujson.decode( ujson.encode(obj_list, default_handler=str)))
def test_to_html_regression_GH6098(self):
    df = DataFrame({
        u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')],
        u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')],
        'données1': np.random.randn(5),
        'données2': np.random.randn(5)})

    # it works
    df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_()
def test_unicode_repr(self):
    mat = np.empty((N, 2), dtype=object)
    mat[:, 0] = 'foo'
    mat[:, 1] = 'bar'
    cols = ['b', u("\u05d0")]
    str_repr = repr(make_block(mat.T, cols, TEST_COLS))
def test_bytestring_with_unicode(self):
    df = Series([u("\u05d0")], name=u("\u05d1"))
    bytes(df)
def test_convert_accepts_unicode(self):
    r1 = self.dtc.convert("12:22", None, None)
    r2 = self.dtc.convert(u("12:22"), None, None)
    assert r1 == r2, "DatetimeConverter.convert should accept unicode"
def test_column_dups_operations(self):

    def check(result, expected=None):
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # assignment
    # GH 3687
    arr = np.random.randn(3, 2)
    idx = lrange(2)
    df = DataFrame(arr, columns=['A', 'A'])
    df.columns = idx
    expected = DataFrame(arr, columns=idx)
    check(df, expected)

    idx = date_range('20130101', periods=4, freq='Q-NOV')
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['a', 'a', 'a', 'a'])
    df.columns = idx
    expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                         columns=idx)
    check(df, expected)

    # insert
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    df['string'] = 'bah'
    expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                          [2, 1, 3, 5, 'bah']],
                         columns=['foo', 'bar', 'foo', 'hello', 'string'])
    check(df, expected)
    with tm.assert_raises_regex(ValueError, 'Length of value'):
        df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

    # insert same dtype
    df['foo2'] = 3
    expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                          [2, 1, 3, 5, 'bah', 3]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)

    # set (non-dup)
    df['foo2'] = 4
    expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                          [2, 1, 3, 5, 'bah', 4]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)
    df['foo2'] = 3

    # delete (non dup)
    del df['bar']
    expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                          [2, 3, 5, 'bah', 3]],
                         columns=['foo', 'foo', 'hello', 'string', 'foo2'])
    check(df, expected)

    # try to delete again (its not consolidated)
    del df['hello']
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # consolidate
    df = df._consolidate()
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # insert
    df.insert(2, 'new_col', 5.)
    expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                          [2, 3, 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col', 'string',
                                  'foo2'])
    check(df, expected)

    # insert a dup
    tm.assert_raises_regex(ValueError, 'cannot insert',
                           df.insert, 2, 'new_col', 4.)
    df.insert(2, 'new_col', 4., allow_duplicates=True)
    expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                          [1, 2, 4., 5., 'bah', 3],
                          [2, 3, 4., 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col', 'new_col',
                                  'string', 'foo2'])
    check(df, expected)

    # delete (dup)
    del df['foo']
    expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                          [4., 5., 'bah', 3]],
                         columns=['new_col', 'new_col', 'string', 'foo2'])
    assert_frame_equal(df, expected)

    # dup across dtypes
    df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    check(df)

    df['foo2'] = 7.
    expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                          [2, 1, 3., 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    result = df['foo']
    expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                         columns=['foo', 'foo'])
    check(result, expected)

    # multiple replacements
    df['foo'] = 'string'
    expected = DataFrame([['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    del df['foo']
    expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                         columns=['bar', 'hello', 'foo2'])
    check(df, expected)

    # values
    df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
    result = df.values
    expected = np.array([[1, 2.5], [3, 4.5]])
    assert (result == expected).all().all()

    # rename, GH 4403
    df4 = DataFrame({'TClose': [22.02],
                     'RT': [0.0454],
                     'TExg': [0.0422]},
                    index=MultiIndex.from_tuples(
                        [(600809, 20130331)],
                        names=['STK_ID', 'RPT_Date']))

    df5 = DataFrame({'STK_ID': [600809] * 3,
                     'RPT_Date': [20120930, 20121231, 20130331],
                     'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                     'TClose': [38.05, 41.66, 30.01]},
                    index=MultiIndex.from_tuples(
                        [(600809, 20120930),
                         (600809, 20121231),
                         (600809, 20130331)],
                        names=['STK_ID', 'RPT_Date']))

    k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
    result = k.rename(columns={'TClose_x': 'TClose',
                               'TClose_y': 'QT_Close'})
    str(result)
    result.dtypes

    expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                            u('饡驦'), 30.01]],
                          columns=['RT', 'TClose', 'TExg', 'RPT_Date',
                                   'STK_ID', 'STK_Name', 'QT_Close'])
                .set_index(['STK_ID', 'RPT_Date'], drop=False))
    assert_frame_equal(result, expected)

    # reindex is invalid!
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    pytest.raises(ValueError, df.reindex, columns=['bar'])
    pytest.raises(ValueError, df.reindex, columns=['bar', 'foo'])

    # drop
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    result = df.drop(['a'], axis=1)
    expected = DataFrame([[1], [1], [1]], columns=['bar'])
    check(result, expected)
    result = df.drop('a', axis=1)
    check(result, expected)

    # describe
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['bar', 'a', 'a'], dtype='float64')
    result = df.describe()
    s = df.iloc[:, 0].describe()
    expected = pd.concat([s, s, s], keys=df.columns, axis=1)
    check(result, expected)

    # check column dups with index equal and not equal to df's index
    df = DataFrame(np.random.randn(5, 3),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'A'])
    for index in [df.index, pd.Index(list('edcba'))]:
        this_df = df.copy()
        expected_ser = pd.Series(index.values, index=this_df.index)
        expected_df = DataFrame.from_items([('A', expected_ser),
                                            ('B', this_df['B']),
                                            ('A', expected_ser)])
        this_df['A'] = index
        check(this_df, expected_df)

    # operations
    for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
        df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
        expected = getattr(df, op)(df)
        expected.columns = ['A', 'A']
        df.columns = ['A', 'A']
        result = getattr(df, op)(df)
        check(result, expected)

    # multiple assignments that change dtypes
    # the location indexer is a slice
    # GH 6120
    df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
    expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

    df['that'] = 1.0
    check(df, expected)

    df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
    expected = DataFrame(1, index=range(5), columns=['that', 'that'])

    df['that'] = 1
    check(df, expected)
def randu(n):
    choices = u("").join(map(unichr, lrange(1488, 1488 + 26)))
    choices += string.digits
    return ''.join([random.choice(choices) for _ in range(n)])
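# Illustrative only: randu(n) returns an n-character unicode string drawn
# from the Hebrew letters U+05D0..U+05E9 (code points 1488..1513) plus the
# ASCII digits; useful for round-trip tests that need non-ASCII data.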
    hash(c)  # this will not raise


@pytest.mark.parametrize("ll", [re.compile('ad')])
def test_is_re_passes(ll):
    assert inference.is_re(ll)


@pytest.mark.parametrize("ll", ['x', 2, 3, object()])
def test_is_re_fails(ll):
    assert not inference.is_re(ll)


@pytest.mark.parametrize("ll", [
    r'a', u('x'), r'asdf', re.compile('adsf'), u(r'\u2233\s*'),
    re.compile(r'')
])
def test_is_recompilable_passes(ll):
    assert inference.is_re_compilable(ll)


@pytest.mark.parametrize("ll", [1, [], object()])
def test_is_recompilable_fails(ll):
    assert not inference.is_re_compilable(ll)


class TestInference(object):

    def test_infer_dtype_bytes(self):
def test_missing_unicode_key(self):
    df = DataFrame({"a": [1]})
    try:
        df.loc[:, u("\u05d0")]  # should not raise UnicodeEncodeError
    except KeyError:
        pass  # this is the expected exception
def test_unicode_repr_doesnt_raise(self):
    repr(create_mgr(u('b,\u05d0: object')))
def test_convert_accepts_unicode(self):
    r1 = self.pc.convert("2012-1-1", None, self.axis)
    r2 = self.pc.convert(u("2012-1-1"), None, self.axis)
    assert r1 == r2
def _write_header(self, indent):
    truncate_h = self.fmt.truncate_h
    row_levels = self.frame.index.nlevels
    if not self.fmt.header:
        # write nothing
        return indent

    def _column_header():
        if self.fmt.index:
            row = [''] * (self.frame.index.nlevels - 1)
        else:
            row = []

        if isinstance(self.columns, ABCMultiIndex):
            if self.fmt.has_column_names and self.fmt.index:
                row.append(single_column_table(self.columns.names))
            else:
                row.append('')
            style = "text-align: {just};".format(just=self.fmt.justify)
            row.extend([single_column_table(c, self.fmt.justify, style)
                        for c in self.columns])
        else:
            if self.fmt.index:
                row.append(self.columns.name or '')
            row.extend(self.columns)
        return row

    self.write('<thead>', indent)

    indent += self.indent_delta

    if isinstance(self.columns, ABCMultiIndex):
        template = 'colspan="{span:d}" halign="left"'

        if self.fmt.sparsify:
            # GH3547
            sentinel = com.sentinel_factory()
        else:
            sentinel = None
        levels = self.columns.format(sparsify=sentinel, adjoin=False,
                                     names=False)
        level_lengths = get_level_lengths(levels, sentinel)
        inner_lvl = len(level_lengths) - 1
        for lnum, (records, values) in enumerate(zip(level_lengths,
                                                     levels)):
            if truncate_h:
                # modify the header lines
                ins_col = self.fmt.tr_col_num
                if self.fmt.sparsify:
                    recs_new = {}
                    # Increment tags after ... col.
                    for tag, span in list(records.items()):
                        if tag >= ins_col:
                            recs_new[tag + 1] = span
                        elif tag + span > ins_col:
                            recs_new[tag] = span + 1
                            if lnum == inner_lvl:
                                values = (values[:ins_col] + (u('...'),) +
                                          values[ins_col:])
                            else:
                                # sparse col headers do not receive a ...
                                values = (values[:ins_col] +
                                          (values[ins_col - 1], ) +
                                          values[ins_col:])
                        else:
                            recs_new[tag] = span
                        # if ins_col lies between tags, all col headers
                        # get ...
                        if tag + span == ins_col:
                            recs_new[ins_col] = 1
                            values = (values[:ins_col] + (u('...'),) +
                                      values[ins_col:])
                    records = recs_new
                    inner_lvl = len(level_lengths) - 1
                    if lnum == inner_lvl:
                        records[ins_col] = 1
                else:
                    recs_new = {}
                    for tag, span in list(records.items()):
                        if tag >= ins_col:
                            recs_new[tag + 1] = span
                        else:
                            recs_new[tag] = span
                    recs_new[ins_col] = 1
                    records = recs_new
                    values = (values[:ins_col] + [u('...')] +
                              values[ins_col:])

            name = self.columns.names[lnum]
            row = [''] * (row_levels - 1) + ['' if name is None else
                                             pprint_thing(name)]

            if row == [""] and self.fmt.index is False:
                row = []

            tags = {}
            j = len(row)
            for i, v in enumerate(values):
                if i in records:
                    if records[i] > 1:
                        tags[j] = template.format(span=records[i])
                else:
                    continue
                j += 1
                row.append(v)
            self.write_tr(row, indent, self.indent_delta, tags=tags,
                          header=True)
    else:
        col_row = _column_header()
        align = self.fmt.justify

        if truncate_h:
            ins_col = row_levels + self.fmt.tr_col_num
            col_row.insert(ins_col, '...')

        self.write_tr(col_row, indent, self.indent_delta, header=True,
                      align=align)

    if all((self.fmt.has_index_names,
            self.fmt.index,
            self.fmt.show_index_names)):
        row = ([x if x is not None else ''
                for x in self.frame.index.names] +
               [''] * min(len(self.columns), self.max_cols))
        if truncate_h:
            ins_col = row_levels + self.fmt.tr_col_num
            row.insert(ins_col, '')
        self.write_tr(row, indent, self.indent_delta, header=True)

    indent -= self.indent_delta
    self.write('</thead>', indent)

    return indent
def test_timtetonum_accepts_unicode():
    assert (converter.time2num("00:01") ==
            converter.time2num(u("00:01")))
def _format_native_types(self, na_rep=u('NaT'), date_format=None,
                         **kwargs):
    from pandas.core.format import Timedelta64Formatter
    return Timedelta64Formatter(values=self,
                                nat_rep=na_rep,
                                justify='all').get_result()
def _write_hierarchical_rows(self, fmt_values, indent):
    template = 'rowspan="{span}" valign="top"'

    truncate_h = self.fmt.truncate_h
    truncate_v = self.fmt.truncate_v
    frame = self.fmt.tr_frame
    ncols = len(frame.columns)
    nrows = len(frame)
    row_levels = self.frame.index.nlevels

    idx_values = frame.index.format(sparsify=False, adjoin=False,
                                    names=False)
    idx_values = lzip(*idx_values)

    if self.fmt.sparsify:
        # GH3547
        sentinel = com.sentinel_factory()
        levels = frame.index.format(sparsify=sentinel, adjoin=False,
                                    names=False)

        level_lengths = get_level_lengths(levels, sentinel)
        inner_lvl = len(level_lengths) - 1
        if truncate_v:
            # Insert ... row and adjust idx_values and
            # level_lengths to take this into account.
            ins_row = self.fmt.tr_row_num
            inserted = False
            for lnum, records in enumerate(level_lengths):
                rec_new = {}
                for tag, span in list(records.items()):
                    if tag >= ins_row:
                        rec_new[tag + 1] = span
                    elif tag + span > ins_row:
                        rec_new[tag] = span + 1

                        # GH 14882 - Make sure insertion done once
                        if not inserted:
                            dot_row = list(idx_values[ins_row - 1])
                            dot_row[-1] = u('...')
                            idx_values.insert(ins_row, tuple(dot_row))
                            inserted = True
                        else:
                            dot_row = list(idx_values[ins_row])
                            dot_row[inner_lvl - lnum] = u('...')
                            idx_values[ins_row] = tuple(dot_row)
                    else:
                        rec_new[tag] = span
                    # If ins_row lies between tags, all cols idx cols
                    # receive ...
                    if tag + span == ins_row:
                        rec_new[ins_row] = 1
                        if lnum == 0:
                            idx_values.insert(ins_row, tuple(
                                [u('...')] * len(level_lengths)))

                        # GH 14882 - Place ... in correct level
                        elif inserted:
                            dot_row = list(idx_values[ins_row])
                            dot_row[inner_lvl - lnum] = u('...')
                            idx_values[ins_row] = tuple(dot_row)

                level_lengths[lnum] = rec_new

            level_lengths[inner_lvl][ins_row] = 1
            for ix_col in range(len(fmt_values)):
                fmt_values[ix_col].insert(ins_row, '...')
            nrows += 1

        for i in range(nrows):
            row = []
            tags = {}

            sparse_offset = 0
            j = 0
            for records, v in zip(level_lengths, idx_values[i]):
                if i in records:
                    if records[i] > 1:
                        tags[j] = template.format(span=records[i])
                else:
                    sparse_offset += 1
                    continue

                j += 1
                row.append(v)

            row.extend(fmt_values[j][i] for j in range(ncols))
            if truncate_h:
                row.insert(row_levels - sparse_offset +
                           self.fmt.tr_col_num, '...')
            self.write_tr(row, indent, self.indent_delta, tags=tags,
                          nindex_levels=len(levels) - sparse_offset)
    else:
        for i in range(len(frame)):
            idx_values = list(zip(*frame.index.format(
                sparsify=False, adjoin=False, names=False)))
            row = []
            row.extend(idx_values[i])
            row.extend(fmt_values[j][i] for j in range(ncols))
            if truncate_h:
                row.insert(row_levels + self.fmt.tr_col_num, '...')
            self.write_tr(row, indent, self.indent_delta, tags=None,
                          nindex_levels=frame.index.nlevels)
def test_tidy_repr(self):
    a = Series([u("\u05d0")] * 1000)
    a.name = 'title1'
    repr(a)  # should not raise exception
def write_result(self, buf):
    """
    Render a DataFrame to a LaTeX tabular/longtable environment output.
    """
    # string representation of the columns
    if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
        info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}')
                     .format(name=type(self.frame).__name__,
                             col=self.frame.columns,
                             idx=self.frame.index))
        strcols = [[info_line]]
    else:
        strcols = self.fmt._to_str_columns()

    def get_col_type(dtype):
        if issubclass(dtype.type, np.number):
            return 'r'
        else:
            return 'l'

    # reestablish the MultiIndex that has been joined by _to_str_column
    if self.fmt.index and isinstance(self.frame.index, MultiIndex):
        out = self.frame.index.format(adjoin=False,
                                      sparsify=self.fmt.sparsify,
                                      names=self.fmt.has_index_names,
                                      na_rep=self.fmt.na_rep)

        # index.format will sparsify repeated entries with empty strings
        # so pad these with some empty space
        def pad_empties(x):
            for pad in reversed(x):
                if pad:
                    break
            return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]
        out = (pad_empties(i) for i in out)

        # Add empty spaces for each column level
        clevels = self.frame.columns.nlevels
        out = [[' ' * len(i[-1])] * clevels + i for i in out]

        # Add the column names to the last index column
        cnames = self.frame.columns.names
        if any(cnames):
            new_names = [i if i else '{}' for i in cnames]
            out[self.frame.index.nlevels - 1][:clevels] = new_names

        # Get rid of old multiindex column and add new ones
        strcols = out + strcols[1:]

    column_format = self.column_format
    if column_format is None:
        dtypes = self.frame.dtypes._values
        column_format = ''.join(map(get_col_type, dtypes))
        if self.fmt.index:
            index_format = 'l' * self.frame.index.nlevels
            column_format = index_format + column_format
    elif not isinstance(column_format,
                        compat.string_types):  # pragma: no cover
        raise AssertionError('column_format must be str or unicode, '
                             'not {typ}'.format(typ=type(column_format)))

    if not self.longtable:
        buf.write('\\begin{{tabular}}{{{fmt}}}\n'.format(fmt=column_format))
        buf.write('\\toprule\n')
    else:
        buf.write('\\begin{{longtable}}{{{fmt}}}\n'.format(fmt=column_format))
        buf.write('\\toprule\n')

    ilevels = self.frame.index.nlevels
    clevels = self.frame.columns.nlevels
    nlevels = clevels
    if self.fmt.has_index_names and self.fmt.show_index_names:
        nlevels += 1
    strrows = list(zip(*strcols))
    self.clinebuf = []

    for i, row in enumerate(strrows):
        if i == nlevels and self.fmt.header:
            buf.write('\\midrule\n')  # End of header
            if self.longtable:
                buf.write('\\endhead\n')
                buf.write('\\midrule\n')
                buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
                          'page}}}} \\\\\n'.format(n=len(row)))
                buf.write('\\midrule\n')
                buf.write('\\endfoot\n\n')
                buf.write('\\bottomrule\n')
                buf.write('\\endlastfoot\n')
        if self.fmt.kwds.get('escape', True):
            # escape backslashes first
            crow = [(x.replace('\\', '\\textbackslash ')
                     .replace('_', '\\_')
                     .replace('%', '\\%')
                     .replace('$', '\\$')
                     .replace('#', '\\#')
                     .replace('{', '\\{')
                     .replace('}', '\\}')
                     .replace('~', '\\textasciitilde ')
                     .replace('^', '\\textasciicircum ')
                     .replace('&', '\\&')
                     if (x and x != '{}') else '{}') for x in row]
        else:
            crow = [x if x else '{}' for x in row]
        if self.bold_rows and self.fmt.index:
            # bold row labels
            crow = ['\\textbf{{{x}}}'.format(x=x)
                    if j < ilevels and x.strip() not in ['', '{}'] else x
                    for j, x in enumerate(crow)]
        if i < clevels and self.fmt.header and self.multicolumn:
            # sum up columns to multicolumns
            crow = self._format_multicolumn(crow, ilevels)
        if (i >= nlevels and self.fmt.index and self.multirow and
                ilevels > 1):
            # sum up rows to multirows
            crow = self._format_multirow(crow, ilevels, i, strrows)
        buf.write(' & '.join(crow))
        buf.write(' \\\\\n')
        if self.multirow and i < len(strrows) - 1:
            self._print_cline(buf, i, len(strcols))

    if not self.longtable:
        buf.write('\\bottomrule\n')
        buf.write('\\end{tabular}\n')
    else:
        buf.write('\\end{longtable}\n')
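# Hedged usage sketch (not part of the formatter above): write_result is
# what DataFrame.to_latex ultimately drives, so the easiest way to exercise
# it is through that public method. Per get_col_type, numeric columns get
# 'r' alignment and everything else 'l'; exact spacing varies by version.
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
print(df.to_latex())
# \begin{tabular}{lrl}    <- 'l' for the index, 'r' for 'a', 'l' for 'b'
# \toprule
# ... rows joined with ' & ' and terminated with ' \\' ...
# \bottomrule
# \end{tabular}
print(df.to_latex(longtable=True))  # emits a longtable environment instead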
def test_plot(self):
    df = tm.makeTimeDataFrame()
    _check_plot_works(df.plot, grid=False)
    _check_plot_works(df.plot, subplots=True)
    _check_plot_works(df.plot, subplots=True, use_index=False)

    df = DataFrame({'x': [1, 2], 'y': [3, 4]})
    self._check_plot_fails(df.plot, kind='line', blarg=True)

    df = DataFrame(np.random.rand(10, 3),
                   index=list(string.ascii_letters[:10]))
    _check_plot_works(df.plot, use_index=True)
    _check_plot_works(df.plot, sort_columns=False)
    _check_plot_works(df.plot, yticks=[1, 5, 10])
    _check_plot_works(df.plot, xticks=[1, 5, 10])
    _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100))
    _check_plot_works(df.plot, subplots=True, title='blah')
    _check_plot_works(df.plot, title='blah')

    tuples = lzip(string.ascii_letters[:10], range(10))
    df = DataFrame(np.random.rand(10, 3),
                   index=MultiIndex.from_tuples(tuples))
    _check_plot_works(df.plot, use_index=True)

    # unicode
    index = MultiIndex.from_tuples([(u('\u03b1'), 0),
                                    (u('\u03b1'), 1),
                                    (u('\u03b2'), 2),
                                    (u('\u03b2'), 3),
                                    (u('\u03b3'), 4),
                                    (u('\u03b3'), 5),
                                    (u('\u03b4'), 6),
                                    (u('\u03b4'), 7)], names=['i0', 'i1'])
    columns = MultiIndex.from_tuples([('bar', u('\u0394')),
                                      ('bar', u('\u0395'))],
                                     names=['c0', 'c1'])
    df = DataFrame(np.random.randint(0, 10, (8, 2)),
                   columns=columns, index=index)
    _check_plot_works(df.plot, title=u('\u03A3'))
def setUp(self):
    from pandas.io.tests.generate_legacy_pickles import create_data
    self.data = create_data()
    self.path = u('__%s__.pickle' % tm.rands(10))
def test_bytestring_with_unicode(self):
    df = DataFrame({'A': [u("\u05d0")]})
    if compat.PY3:
        bytes(df)
    else:
        str(df)
def test_encode_dict_with_unicode_keys(self, unicode_key):
    unicode_dict = {unicode_key: u("value1")}
    assert unicode_dict == ujson.decode(ujson.encode(unicode_dict))
def test_repr_with_unicode_data():
    with pd.core.config.option_context("display.encoding", 'UTF-8'):
        d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
        index = pd.DataFrame(d).set_index(["a", "b"]).index
        assert "\\u" not in repr(index)  # we don't want unicode-escaped
def encode(obj):
    """
    Data encoder
    """
    tobj = type(obj)
    if isinstance(obj, Index):
        if isinstance(obj, RangeIndex):
            return {u'typ': u'range_index',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'start': getattr(obj, '_start', None),
                    u'stop': getattr(obj, '_stop', None),
                    u'step': getattr(obj, '_step', None)}
        elif isinstance(obj, PeriodIndex):
            return {u'typ': u'period_index',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'freq': u_safe(getattr(obj, 'freqstr', None)),
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.asi8),
                    u'compress': compressor}
        elif isinstance(obj, DatetimeIndex):
            tz = getattr(obj, 'tz', None)

            # store tz info and data as UTC
            if tz is not None:
                tz = u(tz.zone)
                obj = obj.tz_convert('UTC')
            return {u'typ': u'datetime_index',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.asi8),
                    u'freq': u_safe(getattr(obj, 'freqstr', None)),
                    u'tz': tz,
                    u'compress': compressor}
        elif isinstance(obj, (IntervalIndex, IntervalArray)):
            if isinstance(obj, IntervalIndex):
                typ = u'interval_index'
            else:
                typ = u'interval_array'
            return {u'typ': typ,
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'left': getattr(obj, 'left', None),
                    u'right': getattr(obj, 'right', None),
                    u'closed': getattr(obj, 'closed', None)}
        elif isinstance(obj, MultiIndex):
            return {u'typ': u'multi_index',
                    u'klass': u(obj.__class__.__name__),
                    u'names': getattr(obj, 'names', None),
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.values),
                    u'compress': compressor}
        else:
            return {u'typ': u'index',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.values),
                    u'compress': compressor}

    elif isinstance(obj, Categorical):
        return {u'typ': u'category',
                u'klass': u(obj.__class__.__name__),
                u'name': getattr(obj, 'name', None),
                u'codes': obj.codes,
                u'categories': obj.categories,
                u'ordered': obj.ordered,
                u'compress': compressor}

    elif isinstance(obj, Series):
        if isinstance(obj, SparseSeries):
            raise NotImplementedError(
                'msgpack sparse series is not implemented')
            # d = {'typ': 'sparse_series',
            #      'klass': obj.__class__.__name__,
            #      'dtype': obj.dtype.name,
            #      'index': obj.index,
            #      'sp_index': obj.sp_index,
            #      'sp_values': convert(obj.sp_values),
            #      'compress': compressor}
            # for f in ['name', 'fill_value', 'kind']:
            #     d[f] = getattr(obj, f, None)
            # return d
        else:
            return {u'typ': u'series',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'index': obj.index,
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.values),
                    u'compress': compressor}
    elif issubclass(tobj, NDFrame):
        if isinstance(obj, SparseDataFrame):
            raise NotImplementedError(
                'msgpack sparse frame is not implemented')
            # d = {'typ': 'sparse_dataframe',
            #      'klass': obj.__class__.__name__,
            #      'columns': obj.columns}
            # for f in ['default_fill_value', 'default_kind']:
            #     d[f] = getattr(obj, f, None)
            # d['data'] = dict([(name, ss)
            #                   for name, ss in compat.iteritems(obj)])
            # return d
        else:
            data = obj._data
            if not data.is_consolidated():
                data = data.consolidate()

            # the block manager
            return {u'typ': u'block_manager',
                    u'klass': u(obj.__class__.__name__),
                    u'axes': data.axes,
                    u'blocks': [{u'locs': b.mgr_locs.as_array,
                                 u'values': convert(b.values),
                                 u'shape': b.values.shape,
                                 u'dtype': u(b.dtype.name),
                                 u'klass': u(b.__class__.__name__),
                                 u'compress': compressor}
                                for b in data.blocks]}

    elif isinstance(obj, (datetime, date, np.datetime64, timedelta,
                          np.timedelta64)) or obj is NaT:
        if isinstance(obj, Timestamp):
            tz = obj.tzinfo
            if tz is not None:
                tz = u(tz.zone)
            freq = obj.freq
            if freq is not None:
                freq = u(freq.freqstr)
            return {u'typ': u'timestamp',
                    u'value': obj.value,
                    u'freq': freq,
                    u'tz': tz}
        if obj is NaT:
            return {u'typ': u'nat'}
        elif isinstance(obj, np.timedelta64):
            return {u'typ': u'timedelta64',
                    u'data': obj.view('i8')}
        elif isinstance(obj, timedelta):
            return {u'typ': u'timedelta',
                    u'data': (obj.days, obj.seconds, obj.microseconds)}
        elif isinstance(obj, np.datetime64):
            return {u'typ': u'datetime64',
                    u'data': u(str(obj))}
        elif isinstance(obj, datetime):
            return {u'typ': u'datetime',
                    u'data': u(obj.isoformat())}
        elif isinstance(obj, date):
            return {u'typ': u'date',
                    u'data': u(obj.isoformat())}
        raise Exception("cannot encode this datetimelike object: %s" % obj)
    elif isinstance(obj, Period):
        return {u'typ': u'period',
                u'ordinal': obj.ordinal,
                u'freq': u_safe(obj.freqstr)}
    elif isinstance(obj, Interval):
        return {u'typ': u'interval',
                u'left': obj.left,
                u'right': obj.right,
                u'closed': obj.closed}
    elif isinstance(obj, BlockIndex):
        return {u'typ': u'block_index',
                u'klass': u(obj.__class__.__name__),
                u'blocs': obj.blocs,
                u'blengths': obj.blengths,
                u'length': obj.length}
    elif isinstance(obj, IntIndex):
        return {u'typ': u'int_index',
                u'klass': u(obj.__class__.__name__),
                u'indices': obj.indices,
                u'length': obj.length}
    elif isinstance(obj, np.ndarray):
        return {u'typ': u'ndarray',
                u'shape': obj.shape,
                u'ndim': obj.ndim,
                u'dtype': u(obj.dtype.name),
                u'data': convert(obj),
                u'compress': compressor}
    elif isinstance(obj, np.number):
        if np.iscomplexobj(obj):
            return {u'typ': u'np_scalar',
                    u'sub_typ': u'np_complex',
                    u'dtype': u(obj.dtype.name),
                    u'real': u(obj.real.__repr__()),
                    u'imag': u(obj.imag.__repr__())}
        else:
            return {u'typ': u'np_scalar',
                    u'dtype': u(obj.dtype.name),
                    u'data': u(obj.__repr__())}
    elif isinstance(obj, complex):
        return {u'typ': u'np_complex',
                u'real': u(obj.real.__repr__()),
                u'imag': u(obj.imag.__repr__())}

    return obj
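# Hedged usage sketch (assumes a pandas version that still ships the
# msgpack format; it was deprecated in 0.25 and removed in 1.0). encode()
# is driven by to_msgpack/read_msgpack. A tz-aware DatetimeIndex takes the
# 'datetime_index' branch above: the data is converted to UTC and the zone
# name is recorded separately under the 'tz' key.
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]},
                  index=pd.date_range('2013-01-01', periods=3,
                                      tz='US/Eastern'))
packed = df.to_msgpack()              # returns bytes when no path is given
roundtripped = pd.read_msgpack(packed)
assert roundtripped.index.tz is not None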
        if compat.PY3 or ord(path_or_buf[0]) >= 0x80:
            fh = None
            try:
                fh = compat.BytesIO(path_or_buf)
                return read(fh)
            finally:
                if fh is not None:
                    fh.close()
    elif hasattr(path_or_buf, 'read') and compat.callable(path_or_buf.read):
        # treat as a buffer like
        return read(path_or_buf)

    raise ValueError('path_or_buf needs to be a string file path or file-like')


dtype_dict = {21: np.dtype('M8[ns]'),
              u('datetime64[ns]'): np.dtype('M8[ns]'),
              u('datetime64[us]'): np.dtype('M8[us]'),
              22: np.dtype('m8[ns]'),
              u('timedelta64[ns]'): np.dtype('m8[ns]'),
              u('timedelta64[us]'): np.dtype('m8[us]'),

              # this is platform int, which we need to remap to np.int64
              # for compat on windows platforms
              7: np.dtype('int64'),
              'category': 'category'
              }


def dtype_for(t):
    """ return my dtype mapping, whether number or name """
    if t in dtype_dict:
        return dtype_dict[t]
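# Illustration of the table above: dtype_for resolves both the numeric
# codes used by older writers and string dtype names to numpy dtypes.
assert dtype_for(u('datetime64[ns]')) == np.dtype('M8[ns]')
assert dtype_for(22) == np.dtype('m8[ns]')
assert dtype_for(7) == np.dtype('int64')  # platform int, remapped for windows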
def test_list_mixed(self):
    x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')]
    x_rec = self.encode_decode(x)
    tm.assert_almost_equal(x, x_rec)
def convert_value(self, v):
    """ convert the expression that is in the term to something that is
    accepted by pytables """

    def stringify(value):
        if self.encoding is not None:
            encoder = partial(com.pprint_thing_encoded,
                              encoding=self.encoding)
        else:
            encoder = com.pprint_thing
        return encoder(value)

    kind = _ensure_decoded(self.kind)
    meta = _ensure_decoded(self.meta)
    if kind == u('datetime64') or kind == u('datetime'):
        if isinstance(v, (int, float)):
            v = stringify(v)
        v = _ensure_decoded(v)
        v = pd.Timestamp(v)
        if v.tz is not None:
            v = v.tz_convert('UTC')
        return TermValue(v, v.value, kind)
    elif (isinstance(v, datetime) or hasattr(v, 'timetuple') or
          kind == u('date')):
        v = time.mktime(v.timetuple())
        return TermValue(v, pd.Timestamp(v), kind)
    elif kind == u('timedelta64') or kind == u('timedelta'):
        v = _coerce_scalar_to_timedelta_type(v, unit='s').value
        return TermValue(int(v), v, kind)
    elif meta == u('category'):
        metadata = com._values_from_object(self.metadata)
        result = metadata.searchsorted(v, side='left')
        return TermValue(result, result, u('integer'))
    elif kind == u('integer'):
        v = int(float(v))
        return TermValue(v, v, kind)
    elif kind == u('float'):
        v = float(v)
        return TermValue(v, v, kind)
    elif kind == u('bool'):
        if isinstance(v, string_types):
            v = v.strip().lower() not in [u('false'), u('f'), u('no'),
                                          u('n'), u('none'), u('0'),
                                          u('[]'), u('{}'), u('')]
        else:
            v = bool(v)
        return TermValue(v, v, kind)
    elif not isinstance(v, string_types):
        v = stringify(v)
        return TermValue(v, stringify(v), u('string'))

    # string quoting
    return TermValue(v, stringify(v), u('string'))
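# Hedged sketch of where convert_value fires (assumes PyTables is
# installed; 'demo.h5' is a throwaway path). The where-clause of
# HDFStore.select is parsed into terms, and each comparison value is
# coerced to the column's storage kind here; e.g. the date string below
# becomes a Timestamp whose i8 value is what actually gets handed to
# pytables for the datetime64 index column.
import pandas as pd

df = pd.DataFrame({'a': range(5)},
                  index=pd.date_range('2013-01-01', periods=5))
with pd.HDFStore('demo.h5') as store:
    store.append('df', df)                     # table format, queryable
    subset = store.select('df', where='index > "2013-01-03"')
assert len(subset) == 2                        # 2013-01-04 and 2013-01-05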