Example #1
0
    def test_astype_str(self):
        """Casting object/mixed Series to string types (GH4405, GH9757)."""
        # GH4405: astype to a string type must agree with map(text_type).
        digits = string.digits
        mixed_series = (
            Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
            Series([digits * 10, tm.rands(63), tm.rands(64), nan, 1.0]),
        )
        for target_type in (compat.text_type, np.str_):
            for series in mixed_series:
                converted = series.astype(target_type)
                via_map = series.map(compat.text_type)
                assert_series_equal(converted, via_map)

        # GH9757
        # Test str and unicode on python 2.x and just str on python 3.x
        for string_type in set([str, compat.text_type]):
            # Naive midnight timestamps render date-only.
            naive_ts = Series([Timestamp('2010-01-04 00:00:00')])
            assert_series_equal(naive_ts.astype(string_type),
                                Series([string_type('2010-01-04')]))

            # tz-aware timestamps keep the full time plus UTC offset.
            aware_ts = Series([Timestamp('2010-01-04 00:00:00',
                                         tz='US/Eastern')])
            assert_series_equal(
                aware_ts.astype(string_type),
                Series([string_type('2010-01-04 00:00:00-05:00')]))

            # Timedeltas render with full nanosecond precision.
            one_day = Series([Timedelta(1, unit='d')])
            assert_series_equal(
                one_day.astype(string_type),
                Series([string_type('1 days 00:00:00.000000000')]))
Example #2
0
    def test_astype_str(self):
        """Cast object/mixed Series to string types (GH4405, GH9757)."""
        # GH4405
        digits = string.digits
        s1 = Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)])
        # s2 mixes strings with NaN and a float to exercise non-string cells.
        s2 = Series([digits * 10, tm.rands(63), tm.rands(64), nan, 1.0])
        types = (compat.text_type, np.str_)
        for typ in types:
            for s in (s1, s2):
                res = s.astype(typ)
                # astype to a string type must agree with map(text_type)
                expec = s.map(compat.text_type)
                assert_series_equal(res, expec)

        # GH9757
        # Test str and unicode on python 2.x and just str on python 3.x
        for tt in set([str, compat.text_type]):
            # Naive midnight timestamps render date-only.
            ts = Series([Timestamp('2010-01-04 00:00:00')])
            s = ts.astype(tt)
            expected = Series([tt('2010-01-04')])
            assert_series_equal(s, expected)

            # tz-aware timestamps keep the full time plus UTC offset.
            ts = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')])
            s = ts.astype(tt)
            expected = Series([tt('2010-01-04 00:00:00-05:00')])
            assert_series_equal(s, expected)

            # Timedeltas render with full nanosecond precision.
            td = Series([Timedelta(1, unit='d')])
            s = td.astype(tt)
            expected = Series([tt('1 days 00:00:00.000000000')])
            assert_series_equal(s, expected)
Example #3
0
    def test_astype_unicode(self):
        """astype("unicode") must agree with map(str) (gh-7758)."""
        digits = string.digits
        test_series = [
            Series([digits * 10,
                    tm.rands(63),
                    tm.rands(64),
                    tm.rands(1000)]),
            Series(['データーサイエンス、お前はもう死んでいる']),
        ]

        # Only add the raw-bytes case when the interpreter's default
        # encoding can already decode it.
        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series(['野菜食べないとやばい'.encode("utf-8")]))

        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(str)
            tm.assert_series_equal(res, expec)

        # BUG FIX: the original ended with a "restore the former encoding"
        # branch guarded by `former_encoding is not None`, but
        # `former_encoding` was assigned None and never changed, so the
        # branch was dead code — and had it ever run on Python 3 it would
        # have raised NameError (`reload`) / AttributeError
        # (`sys.setdefaultencoding`). The dead branch has been removed.
Example #4
0
    def test_astype_unicode(self):
        """astype("unicode") must agree with map(text_type) (GH7758)."""

        # GH7758
        # a bit of magic is required to set default encoding encoding to utf-8
        digits = string.digits
        test_series = [
            Series([digits * 10,
                    tm.rands(63),
                    tm.rands(64),
                    tm.rands(1000)]),
            Series([u('データーサイエンス、お前はもう死んでいる')]),
        ]

        former_encoding = None
        if not compat.PY3:
            # in python we can force the default encoding for this test
            former_encoding = sys.getdefaultencoding()
            reload(sys)  # noqa
            sys.setdefaultencoding("utf-8")
        # Only add the raw-bytes case once the default encoding can decode it.
        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series([u('野菜食べないとやばい').encode("utf-8")]))
        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(compat.text_type)
            assert_series_equal(res, expec)
        # restore the former encoding
        if former_encoding is not None and former_encoding != "utf-8":
            reload(sys)  # noqa
            sys.setdefaultencoding(former_encoding)
Example #5
0
    def test_astype_unicode(self):
        """astype("unicode") must agree with map(text_type) (GH7758)."""

        # GH7758
        # a bit of magic is required to set default encoding encoding to utf-8
        digits = string.digits
        test_series = [
            Series([digits * 10,
                    tm.rands(63),
                    tm.rands(64),
                    tm.rands(1000)]),
            Series([u('データーサイエンス、お前はもう死んでいる')]),
        ]

        former_encoding = None
        if not compat.PY3:
            # in python we can force the default encoding for this test
            former_encoding = sys.getdefaultencoding()
            reload(sys)  # noqa
            sys.setdefaultencoding("utf-8")
        # Only add the raw-bytes case once the default encoding can decode it.
        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series([u('野菜食べないとやばい').encode("utf-8")]))
        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(compat.text_type)
            assert_series_equal(res, expec)
        # restore the former encoding
        if former_encoding is not None and former_encoding != "utf-8":
            reload(sys)  # noqa
            sys.setdefaultencoding(former_encoding)
Example #6
0
    def test_repr_truncation(self):
        """Cells longer than display.max_colwidth render truncated."""
        max_len = 20
        with option_context("display.max_colwidth", max_len):
            # Column B holds strings straddling the truncation boundary.
            frame = DataFrame({
                "A": np.random.randn(10),
                "B": [tm.rands(np.random.randint(max_len - 1, max_len + 1))
                      for _ in range(10)],
            })
            rendered = repr(frame)
            # Drop the header line; keep only the data rows.
            rendered = rendered[rendered.find("\n") + 1:]

            _strlen = fmt._strlen_func()

            for line, value in zip(rendered.split("\n"), frame["B"]):
                # "..." appears exactly when the value exceeds the width.
                if _strlen(value) + 1 > max_len:
                    self.assert_("..." in line)
                else:
                    self.assert_("..." not in line)

        with option_context("display.max_colwidth", 999999):
            self.assert_("..." not in repr(frame))

        with option_context("display.max_colwidth", max_len + 2):
            self.assert_("..." not in repr(frame))
Example #7
0
    def test_timestamp_compare(self):
        """Timestamps compare from either side of a DataFrame (GH4982)."""
        frame = DataFrame(
            {
                "dates1": date_range("20010101", periods=10),
                "dates2": date_range("20010102", periods=10),
                "intcol": np.random.randint(1000000000, size=10),
                "floatcol": np.random.randn(10),
                "stringcol": list(tm.rands(10)),
            }
        )
        # Sprinkle NaT into one datetime column.
        frame.loc[np.random.rand(len(frame)) > 0.5, "dates2"] = pd.NaT
        # Each op paired with its reflection: a OP b  <=>  b MIRROR(OP) a.
        mirrored = {"gt": "lt", "lt": "gt", "ge": "le",
                    "le": "ge", "eq": "eq", "ne": "ne"}
        for op_name, mirror_name in mirrored.items():
            forward = getattr(operator, op_name)
            reflected = getattr(operator, mirror_name)

            # regular timestamp, right-hand vs left-hand side
            expected = forward(frame, Timestamp("20010109"))
            result = reflected(Timestamp("20010109"), frame)
            assert_frame_equal(result, expected)

            # NaT, right-hand vs left-hand side
            expected = forward(frame, Timestamp("nat"))
            result = reflected(Timestamp("nat"), frame)
            assert_frame_equal(result, expected)
    def test_timestamp_compare(self):
        """Timestamps compare from either side of a DataFrame (GH4982)."""
        # make sure we can compare Timestamps on the right AND left hand side
        # GH4982
        df = DataFrame({
            'dates1': date_range('20010101', periods=10),
            'dates2': date_range('20010102', periods=10),
            'intcol': np.random.randint(1000000000, size=10),
            'floatcol': np.random.randn(10),
            'stringcol': list(tm.rands(10))
        })
        # Sprinkle NaT into one datetime column.
        df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT
        # Each op mapped to its reflection: a OP b  <=>  b MIRROR(OP) a.
        ops = {
            'gt': 'lt',
            'lt': 'gt',
            'ge': 'le',
            'le': 'ge',
            'eq': 'eq',
            'ne': 'ne'
        }

        for left, right in ops.items():
            left_f = getattr(operator, left)
            right_f = getattr(operator, right)

            # no nats
            expected = left_f(df, Timestamp('20010109'))
            result = right_f(Timestamp('20010109'), df)
            assert_frame_equal(result, expected)

            # nats
            expected = left_f(df, Timestamp('nat'))
            result = right_f(Timestamp('nat'), df)
            assert_frame_equal(result, expected)
Example #9
0
def generate_dict_strings(string_size, nunique, length, random_order=True):
    """Build a pyarrow DictionaryArray of random strings.

    `nunique` distinct strings of `string_size` characters form the
    dictionary; indices either sample it randomly (`length` entries) or
    repeat each value `length // nunique` times in order.
    """
    dictionary = np.array([rands(string_size) for _ in range(nunique)],
                          dtype='O')
    if random_order:
        index_values = np.random.randint(0, nunique, size=length)
        index_values = index_values.astype('i4')
    else:
        index_values = np.arange(nunique).astype('i4')
        index_values = index_values.repeat(length // nunique)
    return pa.DictionaryArray.from_arrays(index_values, dictionary)
def generate_strings(string_size, nunique, length, random_order=True):
    """Return an object ndarray of random strings drawn from a pool.

    The pool holds `nunique` distinct `string_size`-character strings;
    the result either samples it randomly (`length` entries) or repeats
    each value `length // nunique` times in order.
    """
    pool = np.array([rands(string_size) for _ in range(nunique)], dtype='O')
    if not random_order:
        return pool.repeat(length // nunique)
    picks = np.random.randint(0, nunique, size=length).astype('i4')
    return pool.take(picks)
Example #11
0
    def test_repr_truncation(self):
        """Cells longer than display.max_colwidth render truncated."""
        max_len = 20
        with option_context("display.max_colwidth", max_len):
            # Column B holds strings straddling the truncation boundary.
            df = DataFrame({
                'A':
                np.random.randn(10),
                'B': [
                    tm.rands(np.random.randint(max_len - 1, max_len + 1))
                    for i in range(10)
                ]
            })
            r = repr(df)
            # drop the header line; keep only the data rows
            r = r[r.find('\n') + 1:]

            _strlen = fmt._strlen_func()

            for line, value in zip(r.split('\n'), df['B']):
                # "..." appears exactly when the value exceeds the width
                if _strlen(value) + 1 > max_len:
                    self.assert_('...' in line)
                else:
                    self.assert_('...' not in line)

        with option_context("display.max_colwidth", 999999):
            self.assert_('...' not in repr(df))

        with option_context("display.max_colwidth", max_len + 2):
            self.assert_('...' not in repr(df))
Example #12
0
    def test_wide_repr_multiindex_cols(self):
        """Wide frames with MultiIndex rows/columns get an expanded repr."""
        # Simulate an interactive console so repr obeys width options.
        set_option('test.interactive', True)
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
                                              np.array(col(10, 5))])
        mcols = pandas.MultiIndex.from_arrays([np.array(col(20, 3)),
                                               np.array(col(20, 3))])
        df = DataFrame([col(20, 25) for _ in range(10)],
                       index=midx, columns=mcols)
        df.index.names = ['Level 0', 'Level 1']
        # Capture both the truncated and the expanded ("wide") renderings.
        set_option('print.expand_frame_repr', False)
        rep_str = repr(df)
        set_option('print.expand_frame_repr', True)
        wide_repr = repr(df)
        self.assert_(rep_str != wide_repr)

        # A wider line width should shrink the wrapped repr.
        set_option('print.line_width', 120)
        wider_repr = repr(df)
        self.assert_(len(wider_repr) < len(wide_repr))

        self.assert_(len(wide_repr.splitlines()) == 14 * 10 - 1)

        # Restore global display state for subsequent tests.
        reset_option('print.expand_frame_repr')
        set_option('test.interactive', False)
        set_option('print.line_width', 80)
Example #13
0
    def test_wide_repr_multiindex_cols(self):
        """Wide frames with MultiIndex rows/columns get an expanded repr."""
        with option_context('mode.sim_interactive', True):
            col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
            midx = pandas.MultiIndex.from_arrays(
                [np.array(col(10, 5)),
                 np.array(col(10, 5))])
            mcols = pandas.MultiIndex.from_arrays(
                [np.array(col(20, 3)),
                 np.array(col(20, 3))])
            df = DataFrame([col(20, 25) for _ in range(10)],
                           index=midx,
                           columns=mcols)
            df.index.names = ['Level 0', 'Level 1']
            # Capture both the truncated and the expanded ("wide") renderings.
            set_option('display.expand_frame_repr', False)
            rep_str = repr(df)
            set_option('display.expand_frame_repr', True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

        # A wider line width should shrink the wrapped repr.
        with option_context('display.line_width', 120):
            wider_repr = repr(df)
            self.assert_(len(wider_repr) < len(wide_repr))
            self.assert_(len(wide_repr.splitlines()) == 14 * 10 - 1)

        # Restore global display state for subsequent tests.
        reset_option('display.expand_frame_repr')
Example #14
0
    def test_timestamp_compare(self):
        """Timestamps compare from either side of a DataFrame (GH#4982)."""
        frame = pd.DataFrame(
            {
                "dates1": pd.date_range("20010101", periods=10),
                "dates2": pd.date_range("20010102", periods=10),
                "intcol": np.random.randint(1000000000, size=10),
                "floatcol": np.random.randn(10),
                "stringcol": list(tm.rands(10)),
            }
        )
        # Sprinkle NaT into one datetime column.
        frame.loc[np.random.rand(len(frame)) > 0.5, "dates2"] = pd.NaT
        # Each op paired with its reflection: a OP b  <=>  b MIRROR(OP) a.
        mirrored = {"gt": "lt", "lt": "gt", "ge": "le",
                    "le": "ge", "eq": "eq", "ne": "ne"}

        for op_name, mirror_name in mirrored.items():
            forward = getattr(operator, op_name)
            reflected = getattr(operator, mirror_name)

            # Only eq/ne broadcast cleanly against a mixed-dtype frame;
            # ordering comparisons raise TypeError.
            if op_name in ["eq", "ne"]:
                expected = forward(frame, pd.Timestamp("20010109"))
                result = reflected(pd.Timestamp("20010109"), frame)
                tm.assert_frame_equal(result, expected)
            else:
                with pytest.raises(TypeError):
                    forward(frame, pd.Timestamp("20010109"))
                with pytest.raises(TypeError):
                    reflected(pd.Timestamp("20010109"), frame)
            # NaT comparisons are defined for every op.
            expected = forward(frame, pd.Timestamp("nat"))
            result = reflected(pd.Timestamp("nat"), frame)
            tm.assert_frame_equal(result, expected)
Example #15
0
    def test_timestamp_compare(self):
        """Timestamps compare from either side of a DataFrame (GH4982)."""
        # make sure we can compare Timestamps on the right AND left hand side
        # GH4982
        df = DataFrame({'dates1': date_range('20010101', periods=10),
                        'dates2': date_range('20010102', periods=10),
                        'intcol': np.random.randint(1000000000, size=10),
                        'floatcol': np.random.randn(10),
                        'stringcol': list(tm.rands(10))})
        # Sprinkle NaT into one datetime column.
        df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT
        # Each op mapped to its reflection: a OP b  <=>  b MIRROR(OP) a.
        ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq',
               'ne': 'ne'}

        for left, right in ops.items():
            left_f = getattr(operator, left)
            right_f = getattr(operator, right)

            # no nats
            expected = left_f(df, Timestamp('20010109'))
            result = right_f(Timestamp('20010109'), df)
            assert_frame_equal(result, expected)

            # nats
            expected = left_f(df, Timestamp('nat'))
            result = right_f(Timestamp('nat'), df)
            assert_frame_equal(result, expected)
Example #16
0
    def test_roundtrip_indexlabels(self):
        """Excel round-trip preserves index labels set via index_label."""
        _skip_if_no_xlrd()
        ext = self.ext
        path = '__tmp_to_excel_from_excel_indexlabels__.' + ext

        with ensure_clean(path) as path:

            self.frame['A'][:5] = nan

            # Exercise the main to_excel variants first.
            self.frame.to_excel(path, 'test1')
            self.frame.to_excel(path, 'test1', cols=['A', 'B'])
            self.frame.to_excel(path, 'test1', header=False)
            self.frame.to_excel(path, 'test1', index=False)

            # test index_label
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label=['test'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

            # Extra labels beyond the index depth should be ignored.
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path,
                           'test1',
                           index_label=['test', 'dummy', 'dummy2'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

            # A scalar index_label behaves like a one-element list.
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label='test')
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

        # test index_labels in same row as column names
        path = '%s.%s' % (tm.rands(10), ext)

        with ensure_clean(path) as path:

            self.frame.to_excel(path,
                                'test1',
                                cols=['A', 'B', 'C', 'D'],
                                index=False)
            # take 'A' and 'B' as indexes (they are in same row as cols 'C',
            # 'D')
            df = self.frame.copy()
            df = df.set_index(['A', 'B'])

            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=[0, 1])
            tm.assert_frame_equal(df, recons)
Example #17
0
 def setUp(self):
     """Load msgpack legacy fixtures and pick a random target path."""
     from pandas.io.tests.generate_legacy_storage_files import (
         create_msgpack_data, create_data)
     self.data = create_msgpack_data()
     self.all_data = create_data()
     self.path = u('__%s__.msgpack' % tm.rands(10))
     # Objects each legacy msgpack file must contain to be readable.
     self.minimum_structure = {
         'series': ['float', 'int', 'mixed', 'ts', 'mi', 'dup'],
         'frame': ['float', 'int', 'mixed', 'mi'],
         'panel': ['float'],
         'index': ['int', 'date', 'period'],
         'mi': ['reg2'],
     }
 def setUp(self):
     """Load msgpack legacy fixtures and pick a random target path."""
     from pandas.io.tests.generate_legacy_storage_files import (
         create_msgpack_data, create_data)
     self.data = create_msgpack_data()
     self.all_data = create_data()
     self.path = u('__%s__.msgpack' % tm.rands(10))
     # Objects each legacy msgpack file must contain to be readable.
     self.minimum_structure = {'series': ['float', 'int', 'mixed', 'ts', 'mi', 'dup'],
                               'frame': ['float', 'int', 'mixed', 'mi'],
                               'panel': ['float'],
                               'index': ['int', 'date', 'period'],
                               'mi': ['reg2']}
Example #19
0
    def setup(self):
        """Build a 10-column random-string table (1M rows, 100k uniques)
        and its pandas equivalent for the benchmark."""
        nuniques = 100000
        value_size = 50
        length = 1000000
        num_cols = 10

        # Draw all cells from a fixed pool of unique strings.
        pool = np.array([rands(value_size) for _ in range(nuniques)],
                        dtype='O')
        values = pool[np.random.randint(0, nuniques, size=length)]
        columns = [pa.array(values) for _ in range(num_cols)]
        self.table = pa.table(
            columns, names=['f{}'.format(i) for i in range(num_cols)])
        self.table_df = self.table.to_pandas()
Example #20
0
    def _check_extension_indexlabels(self, ext):
        """Excel round-trip preserves index labels for the given extension."""
        path = '__tmp_to_excel_from_excel_indexlabels__.' + ext
        try:
            self.frame['A'][:5] = nan

            # Exercise the main to_excel variants first.
            self.frame.to_excel(path, 'test1')
            self.frame.to_excel(path, 'test1', cols=['A', 'B'])
            self.frame.to_excel(path, 'test1', header=False)
            self.frame.to_excel(path, 'test1', index=False)

            # test index_label
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label=['test'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

            # Extra labels beyond the index depth should be ignored.
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path,
                           'test1',
                           index_label=['test', 'dummy', 'dummy2'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

            # A scalar index_label behaves like a one-element list.
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label='test')
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)
        finally:
            os.remove(path)

        # test index_labels in same row as column names
        path = '%s.xls' % tm.rands(10)
        try:
            self.frame.to_excel(path,
                                'test1',
                                cols=['A', 'B', 'C', 'D'],
                                index=False)
            # take 'A' and 'B' as indexes (they are in same row as cols 'C',
            # 'D')
            df = self.frame.copy()
            df = df.set_index(['A', 'B'])

            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=[0, 1])
            tm.assert_frame_equal(df, recons)
        finally:
            os.remove(path)
Example #21
0
    def test_roundtrip_indexlabels(self):
        """Excel round-trip preserves index labels set via index_label."""
        _skip_if_no_xlrd()
        ext = self.ext
        path = '__tmp_to_excel_from_excel_indexlabels__.' + ext

        with ensure_clean(path) as path:

            self.frame['A'][:5] = nan

            # Exercise the main to_excel variants first.
            self.frame.to_excel(path, 'test1')
            self.frame.to_excel(path, 'test1', cols=['A', 'B'])
            self.frame.to_excel(path, 'test1', header=False)
            self.frame.to_excel(path, 'test1', index=False)

            # test index_label
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label=['test'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

            # Extra labels beyond the index depth should be ignored.
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(
                path, 'test1', index_label=['test', 'dummy', 'dummy2'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

            # A scalar index_label behaves like a one-element list.
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label='test')
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

        # test index_labels in same row as column names
        path = '%s.%s' % (tm.rands(10), ext)

        with ensure_clean(path) as path:

            self.frame.to_excel(path, 'test1',
                                cols=['A', 'B', 'C', 'D'], index=False)
            # take 'A' and 'B' as indexes (they are in same row as cols 'C',
            # 'D')
            df = self.frame.copy()
            df = df.set_index(['A', 'B'])

            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=[0, 1])
            tm.assert_frame_equal(df, recons)
Example #22
0
def generate_csv_files():
    """Write `nfiles` identical headerless CSVs of random test data
    into a freshly-created 'csv' directory under the ibis test-data dir."""
    N = 10
    nfiles = 10
    frame = pd.DataFrame({'foo': [tm.rands(10) for _ in xrange(N)],
                          'bar': np.random.randn(N),
                          'baz': np.random.randint(0, 100, size=N)},
                         columns=['foo', 'bar', 'baz'])
    csv_base = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'csv')
    os.mkdir(csv_base)
    for i in xrange(nfiles):
        target = pjoin(csv_base, '{0}.csv'.format(i))
        print('Writing {0}'.format(target))
        frame.to_csv(target, index=False, header=False)
Example #23
0
    def setUp(self):
        """Load msgpack legacy fixtures and pick a random target path."""
        from pandas.io.tests.generate_legacy_storage_files import (
            create_msgpack_data, create_data)

        self.data = create_msgpack_data()
        self.all_data = create_data()
        self.path = u("__%s__.msgpack" % tm.rands(10))
        # Objects each legacy msgpack file must contain to be readable.
        self.minimum_structure = {
            "series": ["float", "int", "mixed", "ts", "mi", "dup"],
            "frame": ["float", "int", "mixed", "mi"],
            "panel": ["float"],
            "index": ["int", "date", "period"],
            "mi": ["reg2"],
        }
Example #24
0
def generate_csv_files():
    """Write `nfiles` identical headerless CSVs of random test data
    into a freshly-created 'csv' directory under the ibis test-data dir."""
    N = 10
    nfiles = 10
    df = pd.DataFrame({'foo': [tm.rands(10) for _ in xrange(N)],
                       'bar': np.random.randn(N),
                       'baz': np.random.randint(0, 100, size=N)},
                      columns=['foo', 'bar', 'baz'])
    csv_base = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'csv')
    # Fails if the directory already exists (intentional: fresh data only).
    os.mkdir(csv_base)
    for i in xrange(nfiles):
        csv_path = pjoin(csv_base, '{0}.csv'.format(i))
        print('Writing {0}'.format(csv_path))
        df.to_csv(csv_path, index=False, header=False)
Example #25
0
    def test_series_frame_radd_bug(self):
        """Right-adding a string broadcasts over Series/DataFrame (GH 353)."""
        from pandas.util.testing import rands

        values = Series([rands(5) for _ in xrange(10)])
        # str.__add__ falls back to Series.__radd__, which must broadcast.
        prefixed = values.map(lambda v: 'foo_' + v)
        assert_series_equal('foo_' + values, prefixed)

        frame = DataFrame({'vals': values})
        expected_frame = DataFrame({'vals': values.map(lambda v: 'foo_' + v)})
        tm.assert_frame_equal('foo_' + frame, expected_frame)
Example #26
0
    def test_compress_group_combinations(self):
        """Outer merge on two string keys hits the label-compression path."""
        # ~ 40000000 possible unique groups
        key1 = np.tile(np.array([rands(10) for _ in xrange(10000)],
                                dtype="O"), 2)
        key2 = key1[::-1]

        left = DataFrame({"key1": key1, "key2": key2,
                          "value1": np.random.randn(20000)})
        right = DataFrame({"key1": key1[::2], "key2": key2[::2],
                           "value2": np.random.randn(10000)})

        # just to hit the label compression code path
        merged = merge(left, right, how="outer")
Example #27
0
    def _check_extension_indexlabels(self, ext):
        """Excel round-trip preserves index labels for the given extension."""
        path = '__tmp_to_excel_from_excel_indexlabels__.' + ext
        try:
            self.frame['A'][:5] = nan

            # Exercise the main to_excel variants first.
            self.frame.to_excel(path, 'test1')
            self.frame.to_excel(path, 'test1', cols=['A', 'B'])
            self.frame.to_excel(path, 'test1', header=False)
            self.frame.to_excel(path, 'test1', index=False)

            # test index_label
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label=['test'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

            # Extra labels beyond the index depth should be ignored.
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(
                path, 'test1', index_label=['test', 'dummy', 'dummy2'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

            # A scalar index_label behaves like a one-element list.
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label='test')
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)
        finally:
            os.remove(path)

        # test index_labels in same row as column names
        path = '%s.xls' % tm.rands(10)
        try:
            self.frame.to_excel(path, 'test1',
                                cols=['A', 'B', 'C', 'D'], index=False)
            # take 'A' and 'B' as indexes (they are in same row as cols 'C',
            # 'D')
            df = self.frame.copy()
            df = df.set_index(['A', 'B'])

            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=[0, 1])
            tm.assert_frame_equal(df, recons)
        finally:
            os.remove(path)
Example #28
0
    def test_wide_repr(self):
        """Wide frames wrap their repr; a wider line width shrinks it."""
        with option_context('mode.sim_interactive', True):
            col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
            df = DataFrame([col(20, 25) for _ in range(10)])
            # Compare truncated vs expanded ("wide") renderings.
            set_option('display.expand_frame_repr', False)
            rep_str = repr(df)
            set_option('display.expand_frame_repr', True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

            with option_context('display.line_width', 120):
                # A wider terminal produces a shorter wrapped repr.
                wider_repr = repr(df)
                self.assert_(len(wider_repr) < len(wide_repr))

        # Restore global display state for subsequent tests.
        reset_option('display.expand_frame_repr')
Example #29
0
def test_leak3():
    """Repeatedly writing the same table through one ParquetWriter should
    not grow memory (checked via assert_does_not_leak).

    Fix over the original: the writer was never closed and the temporary
    .parquet file was left on disk; both are now cleaned up in a finally.
    """
    import os
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a{0}'.format(i): [1, 2, 3, 4] for i in range(50)})
    table = pa.Table.from_pandas(df, preserve_index=False)

    # randomized filename so parallel/repeated runs do not collide
    path = 'leak_test_' + tm.rands(5) + '.parquet'
    writer = pq.ParquetWriter(path, table.schema)

    def func():
        writer.write_table(table, row_group_size=len(table))

    try:
        # This does not "leak" per se but we do want to have this use as
        # little memory as possible
        assert_does_not_leak(func, iterations=500, check_interval=50,
                             tolerance=20)
    finally:
        # close the open file handle and remove the temporary parquet file
        writer.close()
        try:
            os.remove(path)
        except OSError:
            pass
Example #30
0
    def test_astype_unicode(self):
        """astype("unicode") should match mapping str over the values."""
        # see gh-7758: A bit of magic is required to set
        # default encoding to utf-8
        digits = string.digits
        test_series = [
            Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
            Series(['データーサイエンス、お前はもう死んでいる']),
        ]

        former_encoding = None

        # only exercise the bytes case when the default codec can decode it
        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series(['野菜食べないとやばい'
                                       .encode("utf-8")]))

        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(str)
            tm.assert_series_equal(res, expec)

        # Restore the former encoding
        # NOTE(review): former_encoding is never assigned a non-None value in
        # this block, so this restore branch appears to be dead code — verify.
        if former_encoding is not None and former_encoding != "utf-8":
            reload(sys)
            sys.setdefaultencoding(former_encoding)
Example #31
0
    def test_wide_repr(self):
        """Wide-frame repr differs between expanded and truncated modes."""
        with option_context('mode.sim_interactive', True):
            # build a list of `length` random strings, each `width` chars
            def make_col(length, width):
                return [tm.rands(width) for _ in xrange(length)]

            df = DataFrame([make_col(20, 25) for _ in range(10)])
            set_option('display.expand_frame_repr', False)
            truncated = repr(df)
            set_option('display.expand_frame_repr', True)
            expanded = repr(df)
            self.assert_(truncated != expanded)

            with option_context('display.line_width', 120):
                widest = repr(df)
                # less wrapping at 120 columns means a shorter repr
                self.assert_(len(widest) < len(expanded))

        reset_option('display.expand_frame_repr')
Example #32
0
    def test_compress_group_combinations(self):
        """Smoke test: outer-merge on two string key columns with a huge
        cardinality of possible key combinations exercises the label
        compression code path (result is intentionally unused).
        """

        # ~ 40000000 possible unique groups
        key1 = np.array([rands(10) for _ in xrange(10000)], dtype='O')
        key1 = np.tile(key1, 2)          # duplicate keys so groups repeat
        key2 = key1[::-1]

        df = DataFrame({'key1' : key1, 'key2' : key2,
                        'value1' : np.random.randn(20000)})

        # second frame holds every other key pair
        df2 = DataFrame({'key1' : key1[::2], 'key2' : key2[::2],
                         'value2' : np.random.randn(10000)})

        # just to hit the label compression code path
        merged = merge(df, df2, how='outer')
Example #33
0
    def _check_extension_indexlabels(self, ext):
        """Round-trip index_label handling through Excel for extension *ext*.

        Checks that an ``index_label`` passed to ``to_excel`` becomes the
        index name when read back, and that frames written with ``index=False``
        can be reconstructed by promoting columns to the index.
        """
        path = "__tmp_to_excel_from_excel_indexlabels__." + ext

        with ensure_clean(path) as path:

            self.frame["A"][:5] = nan

            # various write modes; each overwrites the same sheet
            self.frame.to_excel(path, "test1")
            self.frame.to_excel(path, "test1", cols=["A", "B"])
            self.frame.to_excel(path, "test1", header=False)
            self.frame.to_excel(path, "test1", index=False)

            # test index_label
            frame = DataFrame(np.random.randn(10, 2)) >= 0
            frame.to_excel(path, "test1", index_label=["test"])
            reader = ExcelFile(path)
            recons = reader.parse("test1", index_col=0).astype(np.int64)
            frame.index.names = ["test"]
            self.assertEqual(frame.index.names, recons.index.names)

            # extra labels beyond the index depth: only the first survives
            frame = DataFrame(np.random.randn(10, 2)) >= 0
            frame.to_excel(path, "test1", index_label=["test", "dummy", "dummy2"])
            reader = ExcelFile(path)
            recons = reader.parse("test1", index_col=0).astype(np.int64)
            frame.index.names = ["test"]
            self.assertEqual(frame.index.names, recons.index.names)

            # scalar index_label behaves like a one-element list
            frame = DataFrame(np.random.randn(10, 2)) >= 0
            frame.to_excel(path, "test1", index_label="test")
            reader = ExcelFile(path)
            recons = reader.parse("test1", index_col=0).astype(np.int64)
            frame.index.names = ["test"]
            self.assertEqual(frame.index.names, recons.index.names)

        # test index_labels in same row as column names
        path = "%s.xls" % tm.rands(10)

        with ensure_clean(path) as path:

            self.frame.to_excel(path, "test1", cols=["A", "B", "C", "D"], index=False)
            # take 'A' and 'B' as indexes (they are in same row as cols 'C',
            # 'D')
            df = self.frame.copy()
            df = df.set_index(["A", "B"])

            reader = ExcelFile(path)
            recons = reader.parse("test1", index_col=[0, 1])
            tm.assert_frame_equal(df, recons)
Example #34
0
    def test_excel_roundtrip_indexname(self):
        """The index name survives a to_excel / ExcelFile.parse round trip."""
        _skip_if_no_xlrd()

        # randomized filename; extension comes from the test class
        filename = '%s.%s' % (tm.rands(10), self.ext)

        frame = DataFrame(np.random.randn(10, 4))
        frame.index.name = 'foo'

        with ensure_clean(filename) as path:
            frame.to_excel(path)

            reader = ExcelFile(path)
            roundtripped = reader.parse(reader.sheet_names[0], index_col=0)

            tm.assert_frame_equal(roundtripped, frame)
            self.assertEqual(roundtripped.index.name, 'foo')
Example #35
0
    def test_excel_roundtrip_indexname(self):
        """Index name 'foo' must survive writing to and reading from Excel."""
        _skip_if_no_xlrd()

        # randomized path; self.ext supplies the file extension
        path = '%s.%s' % (tm.rands(10), self.ext)

        df = DataFrame(np.random.randn(10, 4))
        df.index.name = 'foo'

        with ensure_clean(path) as path:
            df.to_excel(path)

            xf = ExcelFile(path)
            # first column holds the written index
            result = xf.parse(xf.sheet_names[0], index_col=0)

            tm.assert_frame_equal(result, df)
            self.assertEqual(result.index.name, 'foo')
Example #36
0
def _test_dataframe(size=10000, seed=0):
    """Build a deterministic DataFrame covering int/uint/float/bool/str
    dtypes for round-trip testing (same data as the original dict literal).
    """
    np.random.seed(seed)
    data = {}
    # integer columns, named after their numpy dtype
    for np_type in (np.uint8, np.uint16, np.uint32, np.uint64,
                    np.int8, np.int16, np.int32, np.int64):
        data[np.dtype(np_type).name] = _random_integers(size, np_type)
    data['float32'] = np.random.randn(size).astype(np.float32)
    data['float64'] = np.arange(size, dtype=np.float64)
    data['bool'] = np.random.randn(size) > 0
    data['strings'] = [tm.rands(10) for i in range(size)]
    return pd.DataFrame(data)
Example #37
0
def _test_dataframe(size=10000, seed=0):
    """Return a seeded DataFrame with one column per basic dtype.

    Parameters
    ----------
    size : int, number of rows
    seed : int, numpy RNG seed (makes the numeric columns reproducible)
    """
    np.random.seed(seed)
    df = pd.DataFrame({
        'uint8': _random_integers(size, np.uint8),
        'uint16': _random_integers(size, np.uint16),
        'uint32': _random_integers(size, np.uint32),
        'uint64': _random_integers(size, np.uint64),
        'int8': _random_integers(size, np.int8),
        'int16': _random_integers(size, np.int16),
        'int32': _random_integers(size, np.int32),
        'int64': _random_integers(size, np.int64),
        'float32': np.random.randn(size).astype(np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        # random 10-char strings (tm.rands uses its own RNG)
        'strings': [tm.rands(10) for i in range(size)]
    })
    return df
Example #38
0
    def test_wide_repr(self):
        """Older option-name variant: wide repr toggles with
        'print.expand_frame_repr' and shrinks at a larger line width.
        """
        set_option('test.interactive', True)
        # col(l, k): l random strings of length k
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        df = DataFrame([col(20, 25) for _ in range(10)])
        set_option('print.expand_frame_repr', False)
        rep_str = repr(df)
        set_option('print.expand_frame_repr', True)
        wide_repr = repr(df)
        self.assert_(rep_str != wide_repr)

        set_option('print.line_width', 120)
        wider_repr = repr(df)
        self.assert_(len(wider_repr) < len(wide_repr))

        # manual option restoration (no option_context in this variant)
        reset_option('print.expand_frame_repr')
        set_option('test.interactive', False)
        set_option('print.line_width', 80)
Example #39
0
    def test_repr_truncation(self):
        """Cells longer than max_colwidth show '...', shorter ones do not."""
        max_len = 20
        fmt.print_config.max_colwidth = max_len
        # column B holds strings straddling the truncation threshold
        df = DataFrame(
            {"A": np.random.randn(10), "B": [tm.rands(np.random.randint(max_len - 1, max_len + 1)) for i in range(10)]}
        )
        r = repr(df)
        r = r[r.find("\n") + 1 :]  # drop the header line
        for line, value in zip(r.split("\n"), df["B"]):
            if fmt._strlen(value) + 1 > max_len:
                self.assert_("..." in line)
            else:
                self.assert_("..." not in line)

        # no limit: nothing is truncated
        fmt.print_config.max_colwidth = max_len + 2
        self.assert_("..." not in repr(df))
Example #40
0
    def test_repr_truncation(self):
        """Long cells are elided with '...' once max_colwidth is exceeded."""
        limit = 20
        fmt.print_config.max_colwidth = limit
        # consume the RNG in the same order as the original dict literal:
        # column A first, then the B strings
        numeric = np.random.randn(10)
        strings = [tm.rands(np.random.randint(limit - 1, limit + 1))
                   for i in range(10)]
        df = DataFrame({'A': numeric, 'B': strings})
        body = repr(df)
        body = body[body.find('\n') + 1:]  # skip the header row
        for line, value in zip(body.split('\n'), df['B']):
            if fmt._strlen(value) + 1 > limit:
                self.assert_('...' in line)
            else:
                self.assert_('...' not in line)

        # disabling the limit removes all ellipses
        fmt.print_config.max_colwidth = None
        self.assert_('...' not in repr(df))

        # a limit just above the longest value also shows no ellipses
        fmt.print_config.max_colwidth = limit + 2
        self.assert_('...' not in repr(df))
Example #41
0
    def test_repr_truncation(self):
        """Per-cell truncation: '...' appears iff the value exceeds
        print_config.max_colwidth.
        """
        max_len = 20
        fmt.print_config.max_colwidth = max_len
        # B values are randomly one char below/at the threshold
        df = DataFrame({'A': np.random.randn(10),
                 'B': [tm.rands(np.random.randint(max_len - 1,
                     max_len + 1)) for i in range(10)]})
        r = repr(df)
        r = r[r.find('\n') + 1:]  # strip the column-header line
        for line, value in zip(r.split('\n'), df['B']):
            if fmt._strlen(value) + 1 > max_len:
                self.assert_('...' in line)
            else:
                self.assert_('...' not in line)

        # None disables truncation entirely
        fmt.print_config.max_colwidth = None
        self.assert_('...' not in repr(df))

        # a slightly larger limit fits every value untruncated
        fmt.print_config.max_colwidth = max_len + 2
        self.assert_('...' not in repr(df))
Example #42
0
    def test_wide_repr_multiindex_cols(self):
        """Wide repr with MultiIndex rows AND columns: modes differ, wider
        line width shrinks the repr, and the wrapped repr has the expected
        line count.
        """
        with option_context("mode.sim_interactive", True):
            # col(l, k): l random strings of length k
            col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
            midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)), np.array(col(10, 5))])
            mcols = pandas.MultiIndex.from_arrays([np.array(col(20, 3)), np.array(col(20, 3))])
            df = DataFrame([col(20, 25) for _ in range(10)], index=midx, columns=mcols)
            df.index.names = ["Level 0", "Level 1"]
            set_option("display.expand_frame_repr", False)
            rep_str = repr(df)
            set_option("display.expand_frame_repr", True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

        # NOTE: these checks run outside the sim_interactive context
        with option_context("display.line_width", 120):
            wider_repr = repr(df)
            self.assert_(len(wider_repr) < len(wide_repr))
            # 14 lines per wrapped chunk x 10 chunks, minus trailing newline
            self.assert_(len(wide_repr.splitlines()) == 14 * 10 - 1)

        reset_option("display.expand_frame_repr")
Example #43
0
    def test_wide_repr_multiindex(self):
        """Wide repr of a MultiIndex-ed frame repeats the index-level names
        in the header of every wrapped chunk.
        """
        with option_context("mode.sim_interactive", True):
            # col(l, k): l random strings of length k
            col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
            midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)), np.array(col(10, 5))])
            df = DataFrame([col(20, 25) for _ in range(10)], index=midx)
            df.index.names = ["Level 0", "Level 1"]
            set_option("print.expand_frame_repr", False)
            rep_str = repr(df)
            set_option("print.expand_frame_repr", True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

            with option_context("print.line_width", 120):
                wider_repr = repr(df)
                self.assert_(len(wider_repr) < len(wide_repr))

            # every 13th line starting at 1 is a chunk header carrying the
            # index level names
            for line in wide_repr.splitlines()[1::13]:
                self.assert_("Level 0 Level 1" in line)

        reset_option("print.expand_frame_repr")
Example #44
0
    def test_wide_repr_named(self):
        """A named index reappears in the header of each wrapped repr chunk."""
        with option_context('mode.sim_interactive', True):
            # build a list of `length` random strings, each `width` chars
            def make_col(length, width):
                return [tm.rands(width) for _ in xrange(length)]

            df = DataFrame([make_col(20, 25) for _ in range(10)])
            df.index.name = 'DataFrame Index'
            set_option('display.expand_frame_repr', False)

            truncated = repr(df)
            set_option('display.expand_frame_repr', True)
            expanded = repr(df)
            self.assert_(truncated != expanded)

            with option_context('display.line_width', 120):
                widest = repr(df)
                self.assert_(len(widest) < len(expanded))

            # every 13th line (each chunk header) must carry the index name
            for header_line in expanded.splitlines()[1::13]:
                self.assert_('DataFrame Index' in header_line)

        reset_option('display.expand_frame_repr')
Example #45
0
    def test_wide_repr_named(self):
        """Named-index variant of the wide-repr test: the index name shows
        up in every wrapped chunk header.
        """
        with option_context('mode.sim_interactive', True):
            # col(l, k): l random strings of length k
            col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
            df = DataFrame([col(20, 25) for _ in range(10)])
            df.index.name = 'DataFrame Index'
            set_option('display.expand_frame_repr', False)

            rep_str = repr(df)
            set_option('display.expand_frame_repr', True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

            with option_context('display.line_width', 120):
                wider_repr = repr(df)
                self.assert_(len(wider_repr) < len(wide_repr))

            # chunk headers occur every 13 lines in the wrapped repr
            for line in wide_repr.splitlines()[1::13]:
                self.assert_('DataFrame Index' in line)

        reset_option('display.expand_frame_repr')
Example #46
0
def _test_dataframe(size=10000, seed=0):
    """Return a seeded DataFrame with one column per basic dtype, plus two
    all-None columns for null-handling round trips.

    Parameters
    ----------
    size : int, number of rows
    seed : int, numpy RNG seed for the numeric columns
    """
    np.random.seed(seed)
    df = pd.DataFrame({
        'uint8': _random_integers(size, np.uint8),
        'uint16': _random_integers(size, np.uint16),
        'uint32': _random_integers(size, np.uint32),
        'uint64': _random_integers(size, np.uint64),
        'int8': _random_integers(size, np.int8),
        'int16': _random_integers(size, np.int16),
        'int32': _random_integers(size, np.int32),
        'int64': _random_integers(size, np.int64),
        'float32': np.random.randn(size).astype(np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': [tm.rands(10) for i in range(size)],
        'all_none': [None] * size,
        'all_none_category': [None] * size
    })
    # TODO(PARQUET-1015)
    # df['all_none_category'] = df['all_none_category'].astype('category')
    return df
def _test_dataframe(size=10000, seed=0):
    """Build a deterministic mixed-dtype DataFrame, including all-None
    columns, for serialization round-trip tests.
    """
    np.random.seed(seed)
    data = {}
    # integer columns, keyed by their numpy dtype name
    for np_type in (np.uint8, np.uint16, np.uint32, np.uint64,
                    np.int8, np.int16, np.int32, np.int64):
        data[np.dtype(np_type).name] = _random_integers(size, np_type)
    data['float32'] = np.random.randn(size).astype(np.float32)
    data['float64'] = np.arange(size, dtype=np.float64)
    data['bool'] = np.random.randn(size) > 0
    data['strings'] = [tm.rands(10) for i in range(size)]
    data['all_none'] = [None] * size
    data['all_none_category'] = [None] * size
    df = pd.DataFrame(data)
    # TODO(PARQUET-1015)
    # df['all_none_category'] = df['all_none_category'].astype('category')
    return df
Example #48
0
    def test_excel_roundtrip_indexname(self):
        """Index name survives a to_excel / ExcelFile.parse round trip.

        Fix over the original: the temp .xls was removed only after all
        assertions passed, so a failing assertion leaked the file; the
        cleanup now runs in a finally block.
        """
        _skip_if_no_xlrd()
        _skip_if_no_xlwt()

        # randomized name so concurrent runs do not clash
        path = '%s.xls' % tm.rands(10)

        df = DataFrame(np.random.randn(10, 4))
        df.index.name = 'foo'

        try:
            df.to_excel(path)

            xf = ExcelFile(path)
            result = xf.parse(xf.sheet_names[0], index_col=0)

            tm.assert_frame_equal(result, df)
            self.assertEqual(result.index.name, 'foo')
        finally:
            # best-effort cleanup; os.error is an alias of OSError
            try:
                os.remove(path)
            except OSError:
                pass
Example #49
0
    def test_excel_roundtrip_indexname(self):
        """Index name "foo" must survive writing to and reading from .xls."""
        _skip_if_no_xlrd()
        _skip_if_no_xlwt()

        # randomized filename to avoid collisions between runs
        path = "%s.xls" % tm.rands(10)

        df = DataFrame(np.random.randn(10, 4))
        df.index.name = "foo"

        df.to_excel(path)

        xf = ExcelFile(path)
        # the written index is the first column of the sheet
        result = xf.parse(xf.sheet_names[0], index_col=0)

        tm.assert_frame_equal(result, df)
        self.assertEqual(result.index.name, "foo")

        # best-effort cleanup; NOTE only reached when the assertions pass
        try:
            os.remove(path)
        except os.error:
            pass
Example #50
0
    def test_utf16_bom_skiprows(self):
        """skiprows must work on UTF-16 files (with/without explicit BOM
        byte order): the parsed result must match the same data read as
        UTF-8.
        """
        # #2298
        data = u("""skip this
skip this too
A\tB\tC
1\t2\t3
4\t5\t6""")

        data2 = u("""skip this
skip this too
A,B,C
1,2,3
4,5,6""")

        path = '__%s__.csv' % tm.rands(10)

        with tm.ensure_clean(path) as path:
            # tab-separated and comma-separated variants of the same table
            for sep, dat in [('\t', data), (',', data2)]:
                for enc in ['utf-16', 'utf-16le', 'utf-16be']:
                    # write the file in the UTF-16 flavor under test
                    bytes = dat.encode(enc)
                    with open(path, 'wb') as f:
                        f.write(bytes)

                    # reference copy of the same data as UTF-8
                    s = BytesIO(dat.encode('utf-8'))
                    if compat.PY3:
                        # somewhat False since the code never sees bytes
                        from io import TextIOWrapper
                        s = TextIOWrapper(s, encoding='utf-8')

                    result = self.read_csv(path,
                                           encoding=enc,
                                           skiprows=2,
                                           sep=sep)
                    expected = self.read_csv(s,
                                             encoding='utf-8',
                                             skiprows=2,
                                             sep=sep)
                    s.close()

                    tm.assert_frame_equal(result, expected)
Example #51
0
    def test_strided_data_import(self):
        """Round-trip 2-D arrays of several dtypes through DataFrame/Array
        conversion, including a strided (non-contiguous) boolean mask.
        """
        cases = []

        columns = ['a', 'b', 'c']
        N, K = 100, 3
        random_numbers = np.random.randn(N, K).copy() * 100

        numeric_dtypes = [
            'i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', 'f4', 'f8'
        ]

        # one case per numeric dtype, all sharing the same values
        for type_name in numeric_dtypes:
            cases.append(random_numbers.astype(type_name))

        # strings
        cases.append(
            np.array([tm.rands(10) for i in range(N * K)],
                     dtype=object).reshape(N, K).copy())

        # booleans
        boolean_objects = (np.array([True, False, True] * N,
                                    dtype=object).reshape(N, K).copy())

        # add some nulls, so dtype comes back as objects
        boolean_objects[5] = None
        cases.append(boolean_objects)

        # millisecond datetimes
        cases.append(
            np.arange("2016-01-01T00:00:00.001", N * K,
                      dtype='datetime64[ms]').reshape(N, K).copy())

        # column slice of a 2-D array -> strided, non-contiguous mask
        strided_mask = (random_numbers > 0).astype(bool)[:, 0]

        for case in cases:
            df = pd.DataFrame(case, columns=columns)
            col = df['a']

            self._check_pandas_roundtrip(df)
            self._check_array_roundtrip(col)
            self._check_array_roundtrip(col, mask=strided_mask)
Example #52
0
    def test_repr_truncation(self):
        """set_option-based variant: '...' appears iff a cell exceeds
        'print.max_colwidth'.
        """
        max_len = 20
        set_option("print.max_colwidth", max_len)
        # B values randomly straddle the truncation threshold
        df = DataFrame({'A': np.random.randn(10),
                 'B': [tm.rands(np.random.randint(max_len - 1,
                     max_len + 1)) for i in range(10)]})
        r = repr(df)
        r = r[r.find('\n') + 1:]  # drop the header line

        # string-length function depends on unicode settings
        _strlen = fmt._strlen_func()

        for line, value in zip(r.split('\n'), df['B']):
            if _strlen(value) + 1 > max_len:
                self.assert_('...' in line)
            else:
                self.assert_('...' not in line)

        # an effectively unlimited width shows no ellipses
        set_option("print.max_colwidth", 999999)
        self.assert_('...' not in repr(df))

        set_option("print.max_colwidth", max_len + 2)
        self.assert_('...' not in repr(df))
    def test_strided_data_import(self):
        """Round-trip numeric/string/bool/datetime 2-D data through pandas
        conversion, including an array-column mask that is strided.
        """
        cases = []

        columns = ['a', 'b', 'c']
        N, K = 100, 3
        random_numbers = np.random.randn(N, K).copy() * 100

        numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
                          'f4', 'f8']

        # same values cast to each numeric dtype
        for type_name in numeric_dtypes:
            cases.append(random_numbers.astype(type_name))

        # strings
        cases.append(np.array([tm.rands(10) for i in range(N * K)],
                              dtype=object)
                     .reshape(N, K).copy())

        # booleans
        boolean_objects = (np.array([True, False, True] * N, dtype=object)
                           .reshape(N, K).copy())

        # add some nulls, so dtype comes back as objects
        boolean_objects[5] = None
        cases.append(boolean_objects)

        # millisecond-resolution datetimes
        cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
                               dtype='datetime64[ms]')
                     .reshape(N, K).copy())

        # first column of a 2-D bool array -> non-contiguous (strided) mask
        strided_mask = (random_numbers > 0).astype(bool)[:, 0]

        for case in cases:
            df = pd.DataFrame(case, columns=columns)
            col = df['a']

            self._check_pandas_roundtrip(df)
            self._check_array_roundtrip(col)
            self._check_array_roundtrip(col, mask=strided_mask)
Example #54
0
    def test_utf16_bom_skiprows(self):
        """skiprows must behave identically for UTF-16 encoded files
        (all three flavors) and the equivalent UTF-8 data.
        """
        # #2298
        data = u(
            """skip this
skip this too
A\tB\tC
1\t2\t3
4\t5\t6"""
        )

        data2 = u(
            """skip this
skip this too
A,B,C
1,2,3
4,5,6"""
        )

        path = "__%s__.csv" % tm.rands(10)

        with tm.ensure_clean(path) as path:
            # tab- and comma-separated variants of the same table
            for sep, dat in [("\t", data), (",", data2)]:
                for enc in ["utf-16", "utf-16le", "utf-16be"]:
                    # write the UTF-16 flavor under test to disk
                    bytes = dat.encode(enc)
                    with open(path, "wb") as f:
                        f.write(bytes)

                    # in-memory UTF-8 reference copy of the same data
                    s = BytesIO(dat.encode("utf-8"))
                    if compat.PY3:
                        # somewhat False since the code never sees bytes
                        from io import TextIOWrapper

                        s = TextIOWrapper(s, encoding="utf-8")

                    result = self.read_csv(path, encoding=enc, skiprows=2, sep=sep)
                    expected = self.read_csv(s, encoding="utf-8", skiprows=2, sep=sep)
                    s.close()

                    tm.assert_frame_equal(result, expected)
Example #55
0
def get_random_path():
    """Return a randomized pickle filename unlikely to collide between runs."""
    return u'__{0}__.pickle'.format(tm.rands(10))
Example #56
0
class TestSeriesDtypes:
    def test_dt64_series_astype_object(self):
        """astype(object) on a datetime64 Series yields datetime scalars."""
        dt_series = Series(date_range('20130101', periods=3))
        converted = dt_series.astype(object)
        assert converted.dtype == np.object_
        assert isinstance(converted.iloc[0], datetime)

    def test_td64_series_astype_object(self):
        """astype(object) on a timedelta64 Series yields timedelta scalars."""
        td_series = Series(['59 Days', '59 Days', 'NaT'],
                           dtype='timedelta64[ns]')
        converted = td_series.astype(object)
        assert converted.dtype == np.object_
        assert isinstance(converted.iloc[0], timedelta)

    @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])
    def test_astype(self, dtype):
        # astype must cast to the requested dtype and preserve the name
        s = Series(np.random.randn(5), name='foo')
        as_typed = s.astype(dtype)

        assert as_typed.dtype == dtype
        assert as_typed.name == s.name

    def test_asobject_deprecated(self):
        """Series.asobject warns FutureWarning and returns an ndarray."""
        series = Series(np.random.randn(5), name='foo')
        with tm.assert_produces_warning(FutureWarning):
            as_obj = series.asobject
        assert isinstance(as_obj, np.ndarray)

    def test_dtype(self, datetime_series):
        # dtype/dtypes agree for a float64 series; ftype/ftypes and the
        # *_counts accessors are deprecated and must warn

        assert datetime_series.dtype == np.dtype('float64')
        assert datetime_series.dtypes == np.dtype('float64')

        # GH 26705 - Assert .ftype is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert datetime_series.ftype == 'float64:dense'

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert datetime_series.ftypes == 'float64:dense'
        # one column of dtype float64
        tm.assert_series_equal(datetime_series.get_dtype_counts(),
                               Series(1, ['float64']))
        # GH18243 - Assert .get_ftype_counts is deprecated
        with tm.assert_produces_warning(FutureWarning):
            tm.assert_series_equal(datetime_series.get_ftype_counts(),
                                   Series(1, ['float64:dense']))

    @pytest.mark.parametrize("value", [np.nan, np.inf])
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    def test_astype_cast_nan_inf_int(self, dtype, value):
        # gh-14265: check NaN and inf raise error when converting to int
        msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer'
        s = Series([value])

        # must raise rather than silently producing a garbage integer
        with pytest.raises(ValueError, match=msg):
            s.astype(dtype)

    @pytest.mark.parametrize("dtype", [int, np.int8, np.int64])
    def test_astype_cast_object_int_fail(self, dtype):
        """Casting non-numeric object strings to int must raise ValueError."""
        mixed = Series(["car", "house", "tree", "1"])
        expected_msg = r"invalid literal for int\(\) with base 10: 'car'"
        with pytest.raises(ValueError, match=expected_msg):
            mixed.astype(dtype)

    def test_astype_cast_object_int(self):
        """Numeric strings in an object Series cast cleanly to int."""
        numeric_strings = Series(['1', '2', '3', '4'], dtype=object)
        casted = numeric_strings.astype(int)
        tm.assert_series_equal(casted, Series(np.arange(1, 5)))

    def test_astype_datetime(self):
        # casting datetime64 series (all-NaT, single value, with NaN holes)
        # to object must always yield object dtype
        s = Series(iNaT, dtype='M8[ns]', index=range(5))

        s = s.astype('O')
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0)])

        s = s.astype('O')
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)])

        # inserting NaN keeps the series as datetime64
        s[1] = np.nan
        assert s.dtype == 'M8[ns]'

        s = s.astype('O')
        assert s.dtype == np.object_

    def test_astype_datetime64tz(self):
        # round-tripping a tz-aware datetime series through object and
        # through explicit datetime64[ns, tz] dtypes
        s = Series(date_range('20130101', periods=3, tz='US/Eastern'))

        # astype
        result = s.astype(object)
        expected = Series(s.astype(object), dtype=object)
        tm.assert_series_equal(result, expected)

        # localize the naive values then convert back to the original tz
        result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(s.dt.tz)
        tm.assert_series_equal(result, s)

        # astype - object, preserves on construction
        result = Series(s.astype(object))
        expected = s.astype(object)
        tm.assert_series_equal(result, expected)

        # astype - datetime64[ns, tz]
        result = Series(s.values).astype('datetime64[ns, US/Eastern]')
        tm.assert_series_equal(result, s)

        result = Series(s.values).astype(s.dtype)
        tm.assert_series_equal(result, s)

        # converting between timezones shifts the wall-clock values
        result = s.astype('datetime64[ns, CET]')
        expected = Series(date_range('20130101 06:00:00', periods=3, tz='CET'))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", [str, np.str_])
    @pytest.mark.parametrize("series", [
        Series(
            [string.digits * 10,
             tm.rands(63),
             tm.rands(64),
             tm.rands(1000)]),
        Series([string.digits * 10,
                tm.rands(63),
                tm.rands(64), np.nan, 1.0])
    ])
    def test_astype_str_map(self, dtype, series):
        # see gh-4405
        # astype to a string dtype must agree with mapping str over values
        result = series.astype(dtype)
        expected = series.map(str)
        tm.assert_series_equal(result, expected)

    def test_astype_str_cast(self):
        """Timestamp/Timedelta Series cast to str produce ISO-style text."""
        # see gh-9757
        naive = Series([Timestamp('2010-01-04 00:00:00')])
        tm.assert_series_equal(naive.astype(str),
                               Series([str('2010-01-04')]))

        aware = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')])
        tm.assert_series_equal(aware.astype(str),
                               Series([str('2010-01-04 00:00:00-05:00')]))

        deltas = Series([Timedelta(1, unit='d')])
        tm.assert_series_equal(deltas.astype(str),
                               Series([str('1 days 00:00:00.000000000')]))

    def test_astype_unicode(self):
        """astype("unicode") should match mapping str over the values."""
        # see gh-7758: A bit of magic is required to set
        # default encoding to utf-8
        digits = string.digits
        test_series = [
            Series([digits * 10,
                    tm.rands(63),
                    tm.rands(64),
                    tm.rands(1000)]),
            Series(['データーサイエンス、お前はもう死んでいる']),
        ]

        former_encoding = None

        # only add the bytes case when the default codec can decode it
        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series(['野菜食べないとやばい'.encode("utf-8")]))

        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(str)
            tm.assert_series_equal(res, expec)

        # Restore the former encoding
        # NOTE(review): former_encoding is never set to anything but None
        # above, so this branch appears to be dead code — verify.
        if former_encoding is not None and former_encoding != "utf-8":
            reload(sys)
            sys.setdefaultencoding(former_encoding)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # see gh-7271
        # dict-like dtype mappings are keyed by the Series name only
        s = Series(range(0, 10, 2), name='abc')

        dt1 = dtype_class({'abc': str})
        result = s.astype(dt1)
        expected = Series(['0', '2', '4', '6', '8'], name='abc')
        tm.assert_series_equal(result, expected)

        dt2 = dtype_class({'abc': 'float64'})
        result = s.astype(dt2)
        expected = Series([0.0, 2.0, 4.0, 6.0, 8.0],
                          dtype='float64',
                          name='abc')
        tm.assert_series_equal(result, expected)

        # any key other than the Series name must raise
        dt3 = dtype_class({'abc': str, 'def': str})
        msg = ("Only the Series name can be used for the key in Series dtype"
               r" mappings\.")
        with pytest.raises(KeyError, match=msg):
            s.astype(dt3)

        dt4 = dtype_class({0: str})
        with pytest.raises(KeyError, match=msg):
            s.astype(dt4)

        # GH16717
        # if dtypes provided is empty, it should error
        dt5 = dtype_class({})
        with pytest.raises(KeyError, match=msg):
            s.astype(dt5)

    def test_astype_categories_deprecation(self):
        """Passing categories/ordered kwargs to astype('category') warns
        but still matches an explicit CategoricalDtype cast."""
        # deprecated 17636
        series = Series(['a', 'b', 'a'])
        expected = series.astype(CategoricalDtype(['a', 'b'], ordered=True))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = series.astype('category', categories=['a', 'b'],
                                   ordered=True)
        tm.assert_series_equal(result, expected)

    def test_astype_from_categorical(self):
        # astype('category') on Series and DataFrame columns must equal an
        # explicitly constructed Categorical; also checks CategoricalDtype
        # with ordered/categories keywords
        items = ["a", "b", "c", "a"]
        s = Series(items)
        exp = Series(Categorical(items))
        res = s.astype('category')
        tm.assert_series_equal(res, exp)

        # same, for integer values
        items = [1, 2, 3, 1]
        s = Series(items)
        exp = Series(Categorical(items))
        res = s.astype('category')
        tm.assert_series_equal(res, exp)

        df = DataFrame({
            "cats": [1, 2, 3, 4, 5, 6],
            "vals": [1, 2, 3, 4, 5, 6]
        })
        cats = Categorical([1, 2, 3, 4, 5, 6])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

        df = DataFrame({
            "cats": ['a', 'b', 'b', 'a', 'a', 'd'],
            "vals": [1, 2, 3, 4, 5, 6]
        })
        cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

        # with keywords
        lst = ["a", "b", "c", "a"]
        s = Series(lst)
        exp = Series(Categorical(lst, ordered=True))
        res = s.astype(CategoricalDtype(None, ordered=True))
        tm.assert_series_equal(res, exp)

        # explicit categories beyond the observed values
        exp = Series(Categorical(lst, categories=list('abcdef'), ordered=True))
        res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
        tm.assert_series_equal(res, exp)

    def test_astype_categorical_to_other(self):
        # conversions from a categorical Series to other dtypes, plus
        # round-trips back to category
        rng = np.random.RandomState(0)
        frame = DataFrame({'value': rng.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        frame = frame.sort_values(by=['value'], ascending=True)
        frame['value_group'] = pd.cut(frame.value,
                                      range(0, 10500, 500),
                                      right=False,
                                      labels=cat_labels)

        grouped = frame['value_group']
        tm.assert_series_equal(grouped.astype('category'), grouped)
        tm.assert_series_equal(grouped.astype(CategoricalDtype()), grouped)
        msg = (r"could not convert string to float|"
               r"invalid literal for float\(\)")
        with pytest.raises(ValueError, match=msg):
            grouped.astype('float64')

        cat_ser = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        tm.assert_series_equal(
            cat_ser.astype('str'),
            Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        num_cat = Series(Categorical(['1', '2', '3', '4']))
        tm.assert_series_equal(num_cat.astype('int'),
                               Series([1, 2, 3, 4]).astype(int))

        # object don't sort correctly, so just compare that we have the same
        # values
        def cmp(a, b):
            tm.assert_almost_equal(np.sort(np.unique(a)),
                                   np.sort(np.unique(b)))

        expected = Series(np.array(grouped.values), name='value_group')
        cmp(grouped.astype('object'), expected)
        cmp(grouped.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(grouped), np.array(grouped.values))

        # valid conversion
        for valid in [
                lambda x: x.astype('category'),
                lambda x: x.astype(CategoricalDtype()),
                lambda x: x.astype('object').astype('category'),
                lambda x: x.astype('object').astype(CategoricalDtype())
        ]:
            # compare series values;
            # internal .categories can't be compared because it is sorted
            tm.assert_series_equal(valid(grouped), grouped,
                                   check_categorical=False)

        # invalid conversion (these are NOT a dtype)
        msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
               "Categorical'> for astype")
        for invalid in [
                lambda x: x.astype(Categorical),
                lambda x: x.astype('object').astype(Categorical)
        ]:
            with pytest.raises(TypeError, match=msg):
                invalid(grouped)

    @pytest.mark.parametrize('name', [None, 'foo'])
    @pytest.mark.parametrize('dtype_ordered', [True, False])
    @pytest.mark.parametrize('series_ordered', [True, False])
    def test_astype_categorical_to_categorical(self, name, dtype_ordered,
                                               series_ordered):
        # GH 10696/18593
        data = list('abcaacbab')
        src_dtype = CategoricalDtype(list('bac'), ordered=series_ordered)
        ser = Series(data, dtype=src_dtype, name=name)

        # unspecified categories: the source categories are kept
        target = CategoricalDtype(ordered=dtype_ordered)
        expected = Series(
            data, name=name,
            dtype=CategoricalDtype(src_dtype.categories, dtype_ordered))
        tm.assert_series_equal(ser.astype(target), expected)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ser.astype('category', ordered=dtype_ordered)
        tm.assert_series_equal(result, expected)

        # different categories
        target = CategoricalDtype(list('adc'), dtype_ordered)
        expected = Series(data, name=name, dtype=target)
        tm.assert_series_equal(ser.astype(target), expected)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ser.astype('category',
                                categories=list('adc'),
                                ordered=dtype_ordered)
        tm.assert_series_equal(result, expected)

        if dtype_ordered is False:
            # not specifying ordered, so only test once
            tm.assert_series_equal(ser.astype('category'), ser)

    def test_astype_categoricaldtype(self):
        # casting with an explicit CategoricalDtype instance
        ser = Series(['a', 'b', 'a'])

        # categories inferred from the data; both orderings
        for ordered in (True, False):
            result = ser.astype(CategoricalDtype(['a', 'b'], ordered=ordered))
            expected = Series(Categorical(['a', 'b', 'a'], ordered=ordered))
            tm.assert_series_equal(result, expected)

        # an extra, unused category is preserved on the result
        result = ser.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False))
        expected = Series(
            Categorical(['a', 'b', 'a'],
                        categories=['a', 'b', 'c'],
                        ordered=False))
        tm.assert_series_equal(result, expected)
        tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c']))

    def test_astype_categoricaldtype_with_args(self):
        # combining a CategoricalDtype with the legacy keywords is an error
        ser = Series(['a', 'b'])
        dtype = CategoricalDtype(['a', 'b'])

        msg = (r"Cannot specify a CategoricalDtype and also `categories` or"
               r" `ordered`\. Use `dtype=CategoricalDtype\(categories,"
               r" ordered\)` instead\.")
        for kwargs in ({'ordered': True},
                       {'categories': ['a', 'b']},
                       {'categories': ['a', 'b'], 'ordered': False}):
            with pytest.raises(TypeError, match=msg):
                ser.astype(dtype, **kwargs)

    @pytest.mark.parametrize("dtype", [
        np.datetime64,
        np.timedelta64,
    ])
    def test_astype_generic_timestamp_no_frequency(self, dtype):
        # see gh-15524, gh-15987
        # casting to a unit-less datetime64/timedelta64 must raise
        ser = Series([1])

        msg = ((r"The '{dtype}' dtype has no unit\. "
                r"Please pass in '{dtype}\[ns\]' instead.").format(
                    dtype=dtype.__name__))
        with pytest.raises(ValueError, match=msg):
            ser.astype(dtype)

    @pytest.mark.parametrize("dtype", np.typecodes['All'])
    def test_astype_empty_constructor_equality(self, dtype):
        # see gh-15524
        # constructing empty with dtype should equal casting empty to dtype
        skipped = (
            "S",
            "V",  # poor support (if any) currently
            "M",
            "m"  # Generic timestamps raise a ValueError. Already tested.
        )
        if dtype not in skipped:
            tm.assert_series_equal(Series([], dtype=dtype),
                                   Series([]).astype(dtype))

    @pytest.mark.filterwarnings('ignore::FutureWarning')
    def test_complex(self):
        # see gh-4819: complex access for ndarray compat
        base = np.arange(5, dtype=np.float64)
        ser = Series(base + 4j * base)

        tm.assert_numpy_array_equal(base, np.real(ser))
        tm.assert_numpy_array_equal(4 * base, np.imag(ser))

        # assigning the real part in place leaves the imaginary part intact
        ser.real = np.arange(5) + 5
        tm.assert_numpy_array_equal(base + 5, np.real(ser))
        tm.assert_numpy_array_equal(4 * base, np.imag(ser))

    def test_real_imag_deprecated(self):
        # GH 18262: accessing .imag / .real should emit a FutureWarning
        ser = pd.Series([1])
        with tm.assert_produces_warning(FutureWarning):
            ser.imag
            ser.real

    def test_arg_for_errors_in_astype(self):
        # see gh-14878
        ser = Series([1, 2, 3])

        # an unrecognized ``errors`` value must raise
        msg = (r"Expected value of kwarg 'errors' to be one of \['raise',"
               r" 'ignore'\]\. Supplied value is 'False'")
        with pytest.raises(ValueError, match=msg):
            ser.astype(np.float64, errors=False)

        # a valid ``errors`` value is accepted without complaint
        ser.astype(np.int8, errors='raise')

    def test_intercept_astype_object(self):
        series = Series(date_range('1/1/2000', periods=10))

        # This test no longer makes sense, as
        # Series is by default already M8[ns].
        expected = series.astype('object')

        frame = DataFrame({'a': series, 'b': np.random.randn(len(series))})
        exp_dtypes = Series([np.dtype('datetime64[ns]'),
                             np.dtype('float64')],
                            index=['a', 'b'])
        tm.assert_series_equal(frame.dtypes, exp_dtypes)

        # the datetime column round-trips through .values
        assert (frame.values.squeeze()[:, 0] == expected.values).all()

        frame = DataFrame({'a': series, 'b': ['foo'] * len(series)})
        assert (frame.values.squeeze()[:, 0] == expected.values).all()

    def test_series_to_categorical(self):
        # see gh-16524: test conversion of Series to Categorical
        result = Series(Series(['a', 'b', 'c']), dtype='category')
        expected = Series(['a', 'b', 'c'], dtype='category')
        tm.assert_series_equal(result, expected)

    def test_infer_objects_series(self):
        # GH 11221
        # all-int object array infers to int
        result = Series(np.array([1, 2, 3], dtype='O')).infer_objects()
        tm.assert_series_equal(result, Series([1, 2, 3]))

        # ints + None infers to float with NaN
        result = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects()
        tm.assert_series_equal(result, Series([1., 2., 3., np.nan]))

        # only soft conversions, unconvertable pass thru unchanged
        result = (Series(np.array([1, 2, 3, None, 'a'],
                                  dtype='O')).infer_objects())

        assert result.dtype == 'object'
        tm.assert_series_equal(result, Series([1, 2, 3, None, 'a']))

    def test_is_homogeneous_type(self):
        # a Series always holds exactly one dtype
        for obj in (Series(), Series([1, 2]), Series(pd.Categorical([1, 2]))):
            assert obj._is_homogeneous_type

    @pytest.mark.parametrize("data", [
        pd.period_range("2000", periods=4),
        pd.IntervalIndex.from_breaks([1, 2, 3, 4])
    ])
    def test_values_compatibility(self, data):
        # https://github.com/pandas-dev/pandas/issues/23995
        # .values on extension-backed Series matches the object-cast array
        expected = np.array(data.astype(object))
        result = pd.Series(data).values
        tm.assert_numpy_array_equal(result, expected)
Example #57
0
def get_random_path():
    """Return a random pickle filename of the form ``__<rand>__.pickle``."""
    token = tm.rands(10)
    return "__{}__.pickle".format(token)
Example #58
0
def get_random_path():
    """Return a random pickle filename of the form ``__<rand>__.pickle``."""
    suffix = tm.rands(10)
    return u'__%s__.pickle' % suffix
Example #59
0
def test_rands():
    # tm.rands should return a string of exactly the requested length
    generated = tm.rands(10)
    assert len(generated) == 10
Example #60
0
 def setup(self, uniqueness, total):
     nunique = int(total * uniqueness)
     unique_values = [tm.rands(self.string_length) for i in range(nunique)]
     values = unique_values * (total // nunique)
     self.arr = pa.array(values, type=pa.string())
     self.table = pa.Table.from_arrays([self.arr], ['f0'])