Example #1
    def test_combine_first(self):
        series = Series(common.makeIntIndex(20).astype(float),
                        index=common.makeIntIndex(20))

        series_copy = series * 2
        series_copy[::2] = np.NaN

        # nothing used from the input
        combined = series.combine_first(series_copy)

        self.assert_(np.array_equal(combined, series))

        # Holes filled from input
        combined = series_copy.combine_first(series)
        self.assert_(np.isfinite(combined).all())

        self.assert_(np.array_equal(combined[::2], series[::2]))
        self.assert_(np.array_equal(combined[1::2], series_copy[1::2]))

        # mixed types
        index = common.makeStringIndex(20)
        floats = Series(common.randn(20), index=index)
        strings = Series(common.makeStringIndex(10), index=index[::2])

        combined = strings.combine_first(floats)

        common.assert_dict_equal(strings, combined, compare_keys=False)
        common.assert_dict_equal(floats[1::2], combined, compare_keys=False)

        # corner case
        s = Series([1., 2, 3], index=[0, 1, 2])
        result = s.combine_first(Series([], index=[]))
        assert_series_equal(s, result)
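
For reference, a minimal sketch (not from the test suite) of the behavior these assertions exercise: combine_first keeps the caller's non-null values and fills its holes from the argument, aligning on the index.

import numpy as np
import pandas as pd

a = pd.Series([1.0, np.nan, 3.0])
b = pd.Series([10.0, 20.0, 30.0])
# non-null values of `a` win; NaN holes are filled from `b`
print(a.combine_first(b).tolist())  # [1.0, 20.0, 3.0]
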
Example #2
    def setup(self, sort):
        level1 = tm.makeStringIndex(10).values
        level2 = tm.makeStringIndex(1000).values
        label1 = np.arange(10).repeat(1000)
        label2 = np.tile(np.arange(1000), 10)
        index2 = MultiIndex(levels=[level1, level2],
                            labels=[label1, label2])
        self.df_multi = DataFrame(np.random.randn(len(index2), 4),
                                  index=index2,
                                  columns=['A', 'B', 'C', 'D'])

        self.key1 = np.tile(level1.take(label1), 10)
        self.key2 = np.tile(level2.take(label2), 10)
        self.df = DataFrame({'data1': np.random.randn(100000),
                             'data2': np.random.randn(100000),
                             'key1': self.key1,
                             'key2': self.key2})

        self.df_key1 = DataFrame(np.random.randn(len(level1), 4),
                                 index=level1,
                                 columns=['A', 'B', 'C', 'D'])
        self.df_key2 = DataFrame(np.random.randn(len(level2), 4),
                                 index=level2,
                                 columns=['A', 'B', 'C', 'D'])

        shuf = np.arange(100000)
        np.random.shuffle(shuf)
        self.df_shuf = self.df.reindex(self.df.index[shuf])
    def test_combine_first(self):
        values = tm.makeIntIndex(20).values.astype(float)
        series = Series(values, index=tm.makeIntIndex(20))

        series_copy = series * 2
        series_copy[::2] = np.NaN

        # nothing used from the input
        combined = series.combine_first(series_copy)

        tm.assert_series_equal(combined, series)

        # Holes filled from input
        combined = series_copy.combine_first(series)
        assert np.isfinite(combined).all()

        tm.assert_series_equal(combined[::2], series[::2])
        tm.assert_series_equal(combined[1::2], series_copy[1::2])

        # mixed types
        index = tm.makeStringIndex(20)
        floats = Series(tm.randn(20), index=index)
        strings = Series(tm.makeStringIndex(10), index=index[::2])

        combined = strings.combine_first(floats)

        tm.assert_series_equal(strings, combined.loc[index[::2]])
        tm.assert_series_equal(floats[1::2].astype(object),
                               combined.loc[index[1::2]])

        # corner case
        s = Series([1., 2, 3], index=[0, 1, 2])
        result = s.combine_first(Series([], index=[]))
        assert_series_equal(s, result)
Example #4
    def setup(self):
        N = 100000
        np.random.seed(1234)

        self.int_unique = pd.Int64Index(np.arange(N * 5))
        # cache is_unique
        self.int_unique.is_unique

        self.int = pd.Int64Index(np.arange(N).repeat(5))
        self.float = pd.Float64Index(np.random.randn(N).repeat(5))

        # Convenience naming.
        self.checked_add = pd.core.algorithms.checked_add_with_arr

        self.arr = np.arange(1000000)
        self.arrpos = np.arange(1000000)
        self.arrneg = np.arange(-1000000, 0)
        self.arrmixed = np.array([1, -1]).repeat(500000)
        self.strings = tm.makeStringIndex(100000)

        self.arr_nan = np.random.choice([True, False], size=1000000)
        self.arrmixed_nan = np.random.choice([True, False], size=1000000)

        # match
        self.uniques = tm.makeStringIndex(1000).values
        self.all = self.uniques.repeat(10)
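
A hedged sketch of the helper all of these setups lean on: in older pandas (where pandas.util.testing is still importable), makeStringIndex(k) returns an Index of k short random strings, which is why it serves as cheap synthetic object-dtype data in these benchmarks.

import pandas.util.testing as tm  # deprecated in pandas 1.x, removed in 2.0

idx = tm.makeStringIndex(5)  # Index of 5 random strings
print(len(idx), idx.dtype)   # 5 object
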
Example #5
 def setup(self):
     N = 10000
     K = 10
     key1 = tm.makeStringIndex(N).values.repeat(K)
     key2 = tm.makeStringIndex(N).values.repeat(K)
     col_array = np.vstack([key1, key2, np.random.randn(N * K)])
     col_array2 = col_array.copy()
     col_array2[:, :10000] = np.nan
     self.col_array_list = list(col_array)
Example #6
 def setup(self):
     index = tm.makeStringIndex(1000)
     columns = tm.makeStringIndex(30)
     self.df = DataFrame(np.random.randn(1000, 30), index=index,
                         columns=columns)
     self.idx_scalar = index[100]
     self.col_scalar = columns[10]
     self.bool_indexer = self.df[self.col_scalar] > 0
     self.bool_obj_indexer = self.bool_indexer.astype(object)
Example #7
 def setup(self):
     N, K = 5000, 50
     self.index = tm.makeStringIndex(N)
     self.columns = tm.makeStringIndex(K)
     frame = DataFrame(np.random.randn(N, K), index=self.index,
                       columns=self.columns)
     self.data = frame.to_dict()
     self.dict_list = frame.to_dict(orient='records')
     self.data2 = {i: {j: float(j) for j in range(100)}
                   for i in range(2000)}
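
As a quick illustration of the two shapes built above: the default to_dict() orient maps column -> {index -> value}, while orient='records' yields one dict per row.

import pandas as pd

df = pd.DataFrame({'A': [1, 2]}, index=['x', 'y'])
print(df.to_dict())                  # {'A': {'x': 1, 'y': 2}}
print(df.to_dict(orient='records'))  # [{'A': 1}, {'A': 2}]
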
Example #8
 def setup(self, other_cols, sep, na_rep, na_frac):
     N = 10 ** 5
     mask_gen = lambda: np.random.choice([True, False], N,
                                         p=[1 - na_frac, na_frac])
     self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
     if other_cols == 0:
         # str.cat self-concatenates only for others=None
         self.others = None
     else:
         self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
                                  for i in range(other_cols)})
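
A minimal sketch of the method this benchmark times: Series.str.cat concatenates string columns element-wise, with na_rep standing in for missing values.

import numpy as np
import pandas as pd

s = pd.Series(['a', np.nan, 'c'])
other = pd.Series(['x', 'y', 'z'])
# without na_rep, any row containing NaN would itself be NaN
print(s.str.cat(other, sep='-', na_rep='?').tolist())  # ['a-x', '?-y', 'c-z']
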
Example #9
 def setup_cache(self):
     size = 10**6
     data = {'int64_small': Series(np.random.randint(0, 100, size=size)),
             'int64_large': Series(np.random.randint(0, 10000, size=size)),
             'object_small': Series(
                 tm.makeStringIndex(100).take(
                     np.random.randint(0, 100, size=size))),
             'object_large': Series(
                 tm.makeStringIndex(10000).take(
                     np.random.randint(0, 10000, size=size)))}
     return data
Example #10
 def setup(self):
     np.random.seed(1234)
     N, K = 5000, 50
     self.index = tm.makeStringIndex(N)
     self.columns = tm.makeStringIndex(K)
     self.frame = DataFrame(np.random.randn(N, K),
                            index=self.index,
                            columns=self.columns)
     self.data = self.frame.to_dict()
     self.some_dict = list(self.data.values())[0]
     self.dict_list = self.frame.to_dict(orient='records')
     self.data2 = {i: {j: float(j) for j in range(100)}
                   for i in range(2000)}
Example #11
    def setup(self):
        N = 100000

        self.df = pd.DataFrame(
            {'A': pd.Series(tm.makeStringIndex(100).take(
                np.random.randint(0, 100, size=N))),
             'B': pd.Series(tm.makeStringIndex(10000).take(
                 np.random.randint(0, 10000, size=N))),
             'D': np.random.randn(N),
             'E': np.arange(N),
             'F': pd.date_range('20110101', freq='s', periods=N),
             'G': pd.timedelta_range('1 day', freq='s', periods=N),
             })
        self.df['C'] = self.df['B'].astype('category')
        self.df.iloc[10:20] = np.nan
Example #12
 def setup(self):
     rng = date_range(start='1/1/1970', periods=10000, freq='1min')
     self.df = DataFrame(np.random.rand(10000, 10), index=rng,
                         columns=range(10))
     self.df['foo'] = 'bar'
     self.rng_subset = Index(rng[::2])
     self.df2 = DataFrame(index=range(10000),
                          data=np.random.rand(10000, 30), columns=range(30))
     N = 5000
     K = 200
     level1 = tm.makeStringIndex(N).values.repeat(K)
     level2 = np.tile(tm.makeStringIndex(K).values, N)
     index = MultiIndex.from_arrays([level1, level2])
     self.s = Series(np.random.randn(N * K), index=index)
     self.s_subset = self.s[::2]
Example #13
 def setup(self, sort, dtype):
     N = 10**5
     data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
             'uint': pd.UInt64Index(np.arange(N).repeat(5)),
             'float': pd.Float64Index(np.random.randn(N).repeat(5)),
             'string': tm.makeStringIndex(N).repeat(5)}
     self.idx = data[dtype]
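
The extra arguments in signatures like setup(self, sort, dtype) follow the asv (airspeed velocity) benchmark convention: the harness passes every combination of the class-level params into setup and the timed methods. A minimal sketch with hypothetical parameter values:

import numpy as np
import pandas as pd

class Factorize:
    # asv feeds each (sort, dtype) combination to setup() and time_*()
    params = [[True, False], ['int', 'string']]
    param_names = ['sort', 'dtype']

    def setup(self, sort, dtype):
        N = 10**4
        data = {'int': pd.Index(np.arange(N)),
                'string': pd.Index(['s%d' % i for i in range(N)])}
        self.idx = data[dtype]

    def time_factorize(self, sort, dtype):
        self.idx.factorize(sort=sort)
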
Example #14
 def setUp(self):
     self.strIndex = tm.makeStringIndex(100)
     self.dateIndex = tm.makeDateIndex(100)
     self.intIndex = tm.makeIntIndex(100)
     self.floatIndex = tm.makeFloatIndex(100)
     self.empty = Index([])
     self.tuples = Index(zip(["foo", "bar", "baz"], [1, 2, 3]))
Example #15
    def test_to_html(self):
        # big mixed
        biggie = DataFrame({'A': np.random.randn(200),
                            'B': tm.makeStringIndex(200)},
                           index=lrange(200))

        biggie.loc[:20, 'A'] = np.nan
        biggie.loc[:20, 'B'] = np.nan
        s = biggie.to_html()

        buf = StringIO()
        retval = biggie.to_html(buf=buf)
        self.assertIsNone(retval)
        self.assertEqual(buf.getvalue(), s)

        tm.assertIsInstance(s, compat.string_types)

        biggie.to_html(columns=['B', 'A'], col_space=17)
        biggie.to_html(columns=['B', 'A'],
                       formatters={'A': lambda x: '%.1f' % x})

        biggie.to_html(columns=['B', 'A'], float_format=str)
        biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

        frame = DataFrame(index=np.arange(200))
        frame.to_html()
Example #16
 def setup(self, lines_orient):
     N = 10**5
     ncols = 5
     index = date_range('20000101', periods=N, freq='H')
     timedeltas = timedelta_range(start=1, periods=N, freq='s')
     datetimes = date_range(start=1, periods=N, freq='s')
     ints = np.random.randint(100000000, size=N)
     floats = np.random.randn(N)
     strings = tm.makeStringIndex(N)
     self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
     self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
     self.df_td_int_ts = DataFrame({'td_1': timedeltas,
                                    'td_2': timedeltas,
                                    'int_1': ints,
                                    'int_2': ints,
                                    'ts_1': datetimes,
                                    'ts_2': datetimes},
                                   index=index)
     self.df_int_floats = DataFrame({'int_1': ints,
                                     'int_2': ints,
                                     'int_3': ints,
                                     'float_1': floats,
                                     'float_2': floats,
                                     'float_3': floats},
                                    index=index)
     self.df_int_float_str = DataFrame({'int_1': ints,
                                        'int_2': ints,
                                        'float_1': floats,
                                        'float_2': floats,
                                        'str_1': strings,
                                        'str_2': strings},
                                       index=index)
Example #17
    def setup_method(self, method):
        self.bool_index = tm.makeBoolIndex(10, name='a')
        self.int_index = tm.makeIntIndex(10, name='a')
        self.float_index = tm.makeFloatIndex(10, name='a')
        self.dt_index = tm.makeDateIndex(10, name='a')
        self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize(
            tz='US/Eastern')
        self.period_index = tm.makePeriodIndex(10, name='a')
        self.string_index = tm.makeStringIndex(10, name='a')
        self.unicode_index = tm.makeUnicodeIndex(10, name='a')

        arr = np.random.randn(10)
        self.bool_series = Series(arr, index=self.bool_index, name='a')
        self.int_series = Series(arr, index=self.int_index, name='a')
        self.float_series = Series(arr, index=self.float_index, name='a')
        self.dt_series = Series(arr, index=self.dt_index, name='a')
        self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True)
        self.period_series = Series(arr, index=self.period_index, name='a')
        self.string_series = Series(arr, index=self.string_index, name='a')
        self.unicode_series = Series(arr, index=self.unicode_index, name='a')

        types = ['bool', 'int', 'float', 'dt', 'dt_tz', 'period', 'string',
                 'unicode']
        self.indexes = [getattr(self, '{}_index'.format(t)) for t in types]
        self.series = [getattr(self, '{}_series'.format(t)) for t in types]
        self.objs = self.indexes + self.series
Example #18
    def test_repr_mixed_big(self):
        # big mixed
        biggie = DataFrame({"A": np.random.randn(200),
                            "B": tm.makeStringIndex(200)},
                           index=lrange(200))
        biggie.loc[:20, "A"] = nan
        biggie.loc[:20, "B"] = nan

        foo = repr(biggie)  # noqa
Example #19
 def setup(self):
     n, k = 200, 5000
     levels = [np.arange(n),
               tm.makeStringIndex(n).values,
               1000 + np.arange(n)]
     codes = [np.random.choice(n, (k * n)) for lev in levels]
     self.mi = MultiIndex(levels=levels, codes=codes)
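
For context, the direct MultiIndex constructor used here takes the unique values per level plus parallel arrays of integer positions into those levels (labels= in older pandas, renamed codes= around 0.24):

import pandas as pd

mi = pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
                   codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
print(mi.tolist())  # [('a', 1), ('a', 2), ('b', 1), ('b', 2)]
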
Example #20
    def setup(self):
        N = 25000
        index = tm.makeStringIndex(N)
        self.df = DataFrame({'float1': np.random.randn(N),
                             'float2': np.random.randn(N)},
                            index=index)
        self.df_mixed = DataFrame({'float1': np.random.randn(N),
                                   'float2': np.random.randn(N),
                                   'string1': ['foo'] * N,
                                   'bool1': [True] * N,
                                   'int1': np.random.randint(0, N, size=N)},
                                  index=index)
        self.df_wide = DataFrame(np.random.randn(N, 100))
        self.start_wide = self.df_wide.index[10000]
        self.stop_wide = self.df_wide.index[15000]
        self.df2 = DataFrame({'float1': np.random.randn(N),
                              'float2': np.random.randn(N)},
                             index=date_range('1/1/2000', periods=N))
        self.start = self.df2.index[10000]
        self.stop = self.df2.index[15000]
        self.df_wide2 = DataFrame(np.random.randn(N, 100),
                                  index=date_range('1/1/2000', periods=N))
        self.df_dc = DataFrame(np.random.randn(N, 10),
                               columns=['C%03d' % i for i in range(10)])

        self.fname = '__test__.h5'

        self.store = HDFStore(self.fname)
        self.store.put('fixed', self.df)
        self.store.put('fixed_mixed', self.df_mixed)
        self.store.append('table', self.df2)
        self.store.append('table_mixed', self.df_mixed)
        self.store.append('table_wide', self.df_wide)
        self.store.append('table_wide2', self.df_wide2)
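
A minimal sketch of the HDFStore calls exercised above (requires the optional PyTables dependency; demo.h5 is a throwaway path): put writes the fixed format, append writes the queryable table format, and select reads back.

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.arange(5.0)})
with pd.HDFStore('demo.h5') as store:
    store.put('fixed', df)       # fixed format: fast, not queryable
    store.append('table', df)    # table format: appendable, queryable
    roundtrip = store.select('table')
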
Example #21
    def test_to_html(self):
        # big mixed
        biggie = DataFrame({'A' : randn(200),
                            'B' : tm.makeStringIndex(200)},
                            index=range(200))

        biggie['A'][:20] = nan
        biggie['B'][:20] = nan
        s = biggie.to_html()

        buf = StringIO()
        retval = biggie.to_html(buf=buf)
        self.assert_(retval is None)
        self.assertEqual(buf.getvalue(), s)

        self.assert_(isinstance(s, basestring))

        biggie.to_html(columns=['B', 'A'], col_space=17)
        biggie.to_html(columns=['B', 'A'],
                       formatters={'A' : lambda x: '%.1f' % x})

        biggie.to_html(columns=['B', 'A'], float_format=str)
        biggie.to_html(columns=['B', 'A'], col_space=12,
                       float_format=str)

        frame = DataFrame(index=np.arange(200))
        frame.to_html()
Example #22
    def test_to_html(self):
        # big mixed
        biggie = DataFrame({'A': np.random.randn(200),
                            'B': tm.makeStringIndex(200)},
                           index=lrange(200))

        biggie.loc[:20, 'A'] = np.nan
        biggie.loc[:20, 'B'] = np.nan
        s = biggie.to_html()

        buf = StringIO()
        retval = biggie.to_html(buf=buf)
        assert retval is None
        assert buf.getvalue() == s

        assert isinstance(s, compat.string_types)

        biggie.to_html(columns=['B', 'A'], col_space=17)
        biggie.to_html(columns=['B', 'A'],
                       formatters={'A': lambda x: '{x:.1f}'.format(x=x)})

        biggie.to_html(columns=['B', 'A'], float_format=str)
        biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

        frame = DataFrame(index=np.arange(200))
        frame.to_html()
Example #23
    def setup_method(self, method):
        super(TestIndex, self).setup_method(method)

        self.d = {
            'string': tm.makeStringIndex(100),
            'date': tm.makeDateIndex(100),
            'int': tm.makeIntIndex(100),
            'rng': tm.makeRangeIndex(100),
            'float': tm.makeFloatIndex(100),
            'empty': Index([]),
            'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
            'period': Index(period_range('2012-1-1', freq='M', periods=3)),
            'date2': Index(date_range('2013-01-1', periods=10)),
            'bdate': Index(bdate_range('2013-01-02', periods=10)),
            'cat': tm.makeCategoricalIndex(100),
            'interval': tm.makeIntervalIndex(100),
            'timedelta': tm.makeTimedeltaIndex(100, 'H')
        }

        self.mi = {
            'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
                                           ('foo', 'two'),
                                           ('qux', 'one'), ('qux', 'two')],
                                          names=['first', 'second']),
        }
Example #24
 def setup(self):
     groups = tm.makeStringIndex(10).values
     self.left = DataFrame({'group': groups.repeat(5000),
                            'key': np.tile(np.arange(0, 10000, 2), 10),
                            'lvalue': np.random.randn(50000)})
     self.right = DataFrame({'key': np.arange(10000),
                             'rvalue': np.random.randn(10000)})
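
Frames shaped like this typically feed an ordered merge (an assumption here, since the timed method is not shown). A sketch of pd.merge_ordered with a group-wise forward fill:

import pandas as pd

left = pd.DataFrame({'group': list('aabb'), 'key': [1, 3, 1, 3],
                     'lvalue': [1.0, 2.0, 3.0, 4.0]})
right = pd.DataFrame({'key': [1, 2, 3], 'rvalue': [7.0, 8.0, 9.0]})
# merge per group on the ordered key, forward-filling gaps
out = pd.merge_ordered(left, right, on='key', left_by='group',
                       fill_method='ffill')
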
Example #25
 def setup(self, index):
     N = 10**5
     indexes = {'string': tm.makeStringIndex(N),
                'datetime': date_range('1900', periods=N, freq='s')}
     index = indexes[index]
     self.s = Series(np.random.rand(N), index=index)
     self.lbl = index[80000]
Example #26
    def setUp(self):
        self.bool_index = tm.makeBoolIndex(10, name='a')
        self.int_index = tm.makeIntIndex(10, name='a')
        self.float_index = tm.makeFloatIndex(10, name='a')
        self.dt_index = tm.makeDateIndex(10, name='a')
        self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize(
            tz='US/Eastern')
        self.period_index = tm.makePeriodIndex(10, name='a')
        self.string_index = tm.makeStringIndex(10, name='a')
        self.unicode_index = tm.makeUnicodeIndex(10, name='a')

        arr = np.random.randn(10)
        self.int_series = Series(arr, index=self.int_index, name='a')
        self.float_series = Series(arr, index=self.float_index, name='a')
        self.dt_series = Series(arr, index=self.dt_index, name='a')
        self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True)
        self.period_series = Series(arr, index=self.period_index, name='a')
        self.string_series = Series(arr, index=self.string_index, name='a')

        types = ['bool', 'int', 'float', 'dt', 'dt_tz', 'period', 'string',
                 'unicode']
        fmts = ["{0}_{1}".format(t, f)
                for t in types for f in ['index', 'series']]
        self.objs = [getattr(self, f)
                     for f in fmts if getattr(self, f, None) is not None]
Example #27
    def test_invalid_index_types(self):

        # test all index types
        for i in [tm.makeIntIndex(10), tm.makeFloatIndex(10), tm.makePeriodIndex(10)]:
            self.assertRaises(TypeError, lambda: infer_freq(i))

        for i in [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]:
            self.assertRaises(ValueError, lambda: infer_freq(i))
Example #28
 def setup(self):
     n = 50000
     indices = tm.makeStringIndex(n)
     subsample_size = 40000
     self.x = Series(np.random.randn(n), indices)
     self.y = Series(np.random.randn(subsample_size),
                     index=np.random.choice(indices, subsample_size,
                                            replace=False))
Example #29
 def setup(self, sort, dtype):
     N = 10**5
     data = {'int': pd.Int64Index(np.arange(N)),
             'uint': pd.UInt64Index(np.arange(N)),
             'float': pd.Float64Index(np.arange(N)),
             'string': tm.makeStringIndex(N)}
     self.idx = data[dtype]
     assert self.idx.is_unique
Example #30
 def setup(self):
     self.fname = '__test__.msg'
     N = 100000
     C = 5
     self.df = DataFrame(np.random.randn(N, C),
                         columns=['float{}'.format(i) for i in range(C)],
                         index=date_range('20000101', periods=N, freq='H'))
     self.df['object'] = tm.makeStringIndex(N)
     self.df.to_msgpack(self.fname)
Example #31
    def setup_method(self, method):
        super(TestIndex, self).setup_method(method)

        self.d = {
            'string': tm.makeStringIndex(100),
            'date': tm.makeDateIndex(100),
            'int': tm.makeIntIndex(100),
            'rng': tm.makeRangeIndex(100),
            'float': tm.makeFloatIndex(100),
            'empty': Index([]),
            'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
            'period': Index(period_range('2012-1-1', freq='M', periods=3)),
            'date2': Index(date_range('2013-01-1', periods=10)),
            'bdate': Index(bdate_range('2013-01-02', periods=10)),
            'cat': tm.makeCategoricalIndex(100)
        }

        self.mi = {
            'reg':
            MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
                                    ('foo', 'two'), ('qux', 'one'),
                                    ('qux', 'two')],
                                   names=['first', 'second']),
        }
Example #32
    def setup(self):
        n1 = 400
        n2 = 250
        index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)],
                           labels=[np.repeat(range(n1), n2).tolist(),
                                   list(range(n2)) * n1],
                           names=['lev1', 'lev2'])
        arr = np.random.randn(n1 * n2, 3)
        arr[::10000, 0] = np.nan
        arr[1::10000, 1] = np.nan
        arr[2::10000, 2] = np.nan
        data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3'])
        self.df = data

        n = 20000
        self.df1 = DataFrame(np.random.randint(1, n, (n, 3)),
                             columns=['jim', 'joe', 'jolie'])
        self.df2 = self.df1.copy()
        self.df2['jim'] = self.df2['joe']

        self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)),
                             columns=['jim', 'joe', 'jolie'])
        self.df4 = self.df3.copy()
        self.df4['jim'] = self.df4['joe']
Example #33
 def setup(self, repeats):
     N = 10 ** 5
     self.s = Series(tm.makeStringIndex(N))
     repeat = {"int": 1, "array": np.random.randint(1, 3, N)}
     self.values = repeat[repeats]
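
The method being parametrized, for reference: Series.str.repeat accepts either a single int or a sequence of per-element counts, which is what the 'int' and 'array' cases above correspond to.

import pandas as pd

s = pd.Series(['a', 'b', 'c'])
print(s.str.repeat(2).tolist())          # ['aa', 'bb', 'cc']
print(s.str.repeat([1, 2, 3]).tolist())  # ['a', 'bb', 'ccc']
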
Example #34
    def run_command(key):
        pywren.wrenlogging.default_config('INFO')
        logging.basicConfig(level=logging.DEBUG)
        logger = logging.getLogger(__name__)
        logger.info("before everything")
        logger.info(key)
        t_s = time.time()
        partition_num = key['partition_num']
        rounds = key['rounds']
        #em = key['em']
        taskId = key['taskId']
        appName = key['appName']
        #         partition_num = 1
        #         rounds = 8
        #         #em = key['em']
        #         taskId = 1
        #         appName = 'test-1'
        em = JiffyClient(host=key['em'])
        logger.info("berfore queue")
        data_ques, msg_que = open_or_create_jiffy_queues(
            em, appName, partition_num, 1, 'sender')
        logger.info("queue opend")

        for i in range(rounds):
            ###        dd = read_s3_table(key)

            msg = create_msg(rounds, taskId, i, partition_num)

            #########  create a table here to replace the input
            right_table = 100000
            indices = tm.makeStringIndex(right_table).values
            key = np.tile(indices[:right_table], 1)
            right = DataFrame({
                "key": key,
                "value": np.random.randn(right_table)
            })
            logger.info("Finish generating data")
            x = 0
            if i == rounds - 1:
                x = 1
            encoded = right.to_csv(sep="|", header=False,
                                   index=False).encode('utf-8')
            a = np.random.randint(1, 10, 500000)
            encoded = np.asarray(a).astype('S100').tobytes()
            # print(sys.getsizeof(encoded))

            data_path = "/" + appName + "/" + '01'
            test_que = em.open_or_create_queue(data_path, "local://tmp", 10, 1)
            logger.info("get encoded size" + str(sys.getsizeof(encoded)))
            ta = time.time()
            test_que.put(encoded)
            tb = time.time()
            logger.info("wirte takes" + str(tb - ta))
            logger.info("before get")
            obj = test_que.get()
            logger.info("get obj of size" + str(sys.getsizeof(obj)))
            tc = time.time()
            logger.info("get takes " + str(tc - tb))
#             data_ques[0].put(encoded)
#             logger.info("wirte finished")
#             logger.info("before get")
#             obj = data_ques[0].get()

#res = write_jiffy_partitions(right, ['key'], 'uniform', partition_num, data_ques, msg_que = msg_que, msg = msg, fin = x)

        t_f = time.time()
        #        share.append([t_s,t_f])

        return ([t_s, t_f])
Example #35
 def setUp(self):
     self.strIndex = tm.makeStringIndex(100)
     self.dateIndex = tm.makeDateIndex(100)
     self.intIndex = tm.makeIntIndex(100)
     self.empty = Index([])
     self.tuples = Index(zip(['foo', 'bar', 'baz'], [1, 2, 3]))
Example #36
 def setup(self, repeats):
     N = 10**5
     self.s = Series(tm.makeStringIndex(N))
     repeat = {'int': 1, 'array': np.random.randint(1, 3, N)}
     self.repeat = repeat[repeats]
Example #37
 def setup(self, expand):
     self.s = Series(tm.makeStringIndex(10**5)).str.join('--')
Example #38
 def setup(self, errors):
     N = 10000
     self.float = Series(np.random.randn(N))
     self.numstr = self.float.astype('str')
     self.str = Series(tm.makeStringIndex(N))
Example #39
 def setup(self):
     self.s = Series(index=tm.makeStringIndex(10000))
Example #40
class TestSeriesMisc:
    def test_scalarop_preserve_name(self, datetime_series):
        result = datetime_series * 2
        assert result.name == datetime_series.name

    def test_copy_name(self, datetime_series):
        result = datetime_series.copy()
        assert result.name == datetime_series.name

    def test_copy_index_name_checking(self, datetime_series):
        # don't want to be able to modify the index stored elsewhere after
        # making a copy

        datetime_series.index.name = None
        assert datetime_series.index.name is None
        assert datetime_series is datetime_series

        cp = datetime_series.copy()
        cp.index.name = "foo"
        printing.pprint_thing(datetime_series.index.name)
        assert datetime_series.index.name is None

    def test_append_preserve_name(self, datetime_series):
        result = datetime_series[:5].append(datetime_series[5:])
        assert result.name == datetime_series.name

    def test_binop_maybe_preserve_name(self, datetime_series):
        # names match, preserve
        result = datetime_series * datetime_series
        assert result.name == datetime_series.name
        result = datetime_series.mul(datetime_series)
        assert result.name == datetime_series.name

        result = datetime_series * datetime_series[:-2]
        assert result.name == datetime_series.name

        # names don't match, don't preserve
        cp = datetime_series.copy()
        cp.name = "something else"
        result = datetime_series + cp
        assert result.name is None
        result = datetime_series.add(cp)
        assert result.name is None

        ops = ["add", "sub", "mul", "div", "truediv", "floordiv", "mod", "pow"]
        ops = ops + ["r" + op for op in ops]
        for op in ops:
            # names match, preserve
            s = datetime_series.copy()
            result = getattr(s, op)(s)
            assert result.name == datetime_series.name

            # names don't match, don't preserve
            cp = datetime_series.copy()
            cp.name = "changed"
            result = getattr(s, op)(cp)
            assert result.name is None

    def test_combine_first_name(self, datetime_series):
        result = datetime_series.combine_first(datetime_series[:5])
        assert result.name == datetime_series.name

    def test_getitem_preserve_name(self, datetime_series):
        result = datetime_series[datetime_series > 0]
        assert result.name == datetime_series.name

        result = datetime_series[[0, 2, 4]]
        assert result.name == datetime_series.name

        result = datetime_series[5:10]
        assert result.name == datetime_series.name

    def test_pickle_datetimes(self, datetime_series):
        unp_ts = self._pickle_roundtrip(datetime_series)
        tm.assert_series_equal(unp_ts, datetime_series)

    def test_pickle_strings(self, string_series):
        unp_series = self._pickle_roundtrip(string_series)
        tm.assert_series_equal(unp_series, string_series)

    def _pickle_roundtrip(self, obj):

        with tm.ensure_clean() as path:
            obj.to_pickle(path)
            unpickled = pd.read_pickle(path)
            return unpickled

    def test_argsort_preserve_name(self, datetime_series):
        result = datetime_series.argsort()
        assert result.name == datetime_series.name

    def test_sort_index_name(self, datetime_series):
        result = datetime_series.sort_index(ascending=False)
        assert result.name == datetime_series.name

    def test_constructor_dict(self):
        d = {"a": 0.0, "b": 1.0, "c": 2.0}
        result = Series(d)
        expected = Series(d, index=sorted(d.keys()))
        tm.assert_series_equal(result, expected)

        result = Series(d, index=["b", "c", "d", "a"])
        expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"])
        tm.assert_series_equal(result, expected)

    def test_constructor_subclass_dict(self):
        data = tm.TestSubDict((x, 10.0 * x) for x in range(10))
        series = Series(data)
        expected = Series(dict(data.items()))
        tm.assert_series_equal(series, expected)

    def test_constructor_ordereddict(self):
        # GH3283
        data = OrderedDict(
            ("col{i}".format(i=i), np.random.random()) for i in range(12)
        )

        series = Series(data)
        expected = Series(list(data.values()), list(data.keys()))
        tm.assert_series_equal(series, expected)

        # Test with subclass
        class A(OrderedDict):
            pass

        series = Series(A(data))
        tm.assert_series_equal(series, expected)

    def test_constructor_dict_multiindex(self):
        d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}
        _d = sorted(d.items())
        result = Series(d)
        expected = Series(
            [x[1] for x in _d], index=pd.MultiIndex.from_tuples([x[0] for x in _d])
        )
        tm.assert_series_equal(result, expected)

        d["z"] = 111.0
        _d.insert(0, ("z", d["z"]))
        result = Series(d)
        expected = Series(
            [x[1] for x in _d], index=pd.Index([x[0] for x in _d], tupleize_cols=False)
        )
        result = result.reindex(index=expected.index)
        tm.assert_series_equal(result, expected)

    def test_constructor_dict_timedelta_index(self):
        # GH #12169 : Resample category data with timedelta index
        # constructing a Series from a dict with a TimedeltaIndex
        # used to produce NaN values in the resulting Series data
        expected = Series(
            data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s")
        )

        result = Series(
            data={
                pd.to_timedelta(0, unit="s"): "A",
                pd.to_timedelta(10, unit="s"): "B",
                pd.to_timedelta(20, unit="s"): "C",
            },
            index=pd.to_timedelta([0, 10, 20], unit="s"),
        )
        tm.assert_series_equal(result, expected)

    def test_sparse_accessor_updates_on_inplace(self):
        s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]")
        s.drop([0, 1], inplace=True)
        assert s.sparse.density == 1.0

    def test_tab_completion(self):
        # GH 9910
        s = Series(list("abcd"))
        # Series of str values should have .str but not .dt/.cat in __dir__
        assert "str" in dir(s)
        assert "dt" not in dir(s)
        assert "cat" not in dir(s)

        # similarly for .dt
        s = Series(date_range("1/1/2015", periods=5))
        assert "dt" in dir(s)
        assert "str" not in dir(s)
        assert "cat" not in dir(s)

        # Similarly for .cat, but with the twist that str and dt should
        # also be present when the categories are of that type.
        s = Series(list("abbcd"), dtype="category")
        assert "cat" in dir(s)
        assert "str" in dir(s)  # as it is a string categorical
        assert "dt" not in dir(s)

        # similar to cat and str
        s = Series(date_range("1/1/2015", periods=5)).astype("category")
        assert "cat" in dir(s)
        assert "str" not in dir(s)
        assert "dt" in dir(s)  # as it is a datetime categorical

    def test_tab_completion_with_categorical(self):
        # test the tab completion display
        ok_for_cat = [
            "categories",
            "codes",
            "ordered",
            "set_categories",
            "add_categories",
            "remove_categories",
            "rename_categories",
            "reorder_categories",
            "remove_unused_categories",
            "as_ordered",
            "as_unordered",
        ]

        def get_dir(s):
            results = [r for r in s.cat.__dir__() if not r.startswith("_")]
            return sorted(set(results))

        s = Series(list("aabbcde")).astype("category")
        results = get_dir(s)
        tm.assert_almost_equal(results, sorted(set(ok_for_cat)))

    @pytest.mark.parametrize(
        "index",
        [
            tm.makeUnicodeIndex(10),
            tm.makeStringIndex(10),
            tm.makeCategoricalIndex(10),
            Index(["foo", "bar", "baz"] * 2),
            tm.makeDateIndex(10),
            tm.makePeriodIndex(10),
            tm.makeTimedeltaIndex(10),
            tm.makeIntIndex(10),
            tm.makeUIntIndex(10),
            tm.makeIntIndex(10),
            tm.makeFloatIndex(10),
            Index([True, False]),
            Index(["a{}".format(i) for i in range(101)]),
            pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")),
            pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")),
        ],
    )
    def test_index_tab_completion(self, index):
        # dir contains string-like values of the Index.
        s = pd.Series(index=index)
        dir_s = dir(s)
        for i, x in enumerate(s.index.unique(level=0)):
            if i < 100:
                assert not isinstance(x, str) or not x.isidentifier() or x in dir_s
            else:
                assert x not in dir_s

    def test_not_hashable(self):
        s_empty = Series()
        s = Series([1])
        msg = "'Series' objects are mutable, thus they cannot be hashed"
        with pytest.raises(TypeError, match=msg):
            hash(s_empty)
        with pytest.raises(TypeError, match=msg):
            hash(s)

    def test_contains(self, datetime_series):
        tm.assert_contains_all(datetime_series.index, datetime_series)

    def test_iter_datetimes(self, datetime_series):
        for i, val in enumerate(datetime_series):
            assert val == datetime_series[i]

    def test_iter_strings(self, string_series):
        for i, val in enumerate(string_series):
            assert val == string_series[i]

    def test_keys(self, datetime_series):
        # HACK: By doing this in two stages, we avoid 2to3 wrapping the call
        # to .keys() in a list()
        getkeys = datetime_series.keys
        assert getkeys() is datetime_series.index

    def test_values(self, datetime_series):
        tm.assert_almost_equal(
            datetime_series.values, datetime_series, check_dtype=False
        )

    def test_iteritems_datetimes(self, datetime_series):
        for idx, val in datetime_series.iteritems():
            assert val == datetime_series[idx]

    def test_iteritems_strings(self, string_series):
        for idx, val in string_series.iteritems():
            assert val == string_series[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(string_series.iteritems(), "reverse")

    def test_items_datetimes(self, datetime_series):
        for idx, val in datetime_series.items():
            assert val == datetime_series[idx]

    def test_items_strings(self, string_series):
        for idx, val in string_series.items():
            assert val == string_series[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(string_series.items(), "reverse")

    def test_raise_on_info(self):
        s = Series(np.random.randn(10))
        msg = "'Series' object has no attribute 'info'"
        with pytest.raises(AttributeError, match=msg):
            s.info()

    def test_copy(self):

        for deep in [None, False, True]:
            s = Series(np.arange(10), dtype="float64")

            # default deep is True
            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[::2] = np.NaN

            if deep is None or deep is True:
                # Did not modify original Series
                assert np.isnan(s2[0])
                assert not np.isnan(s[0])
            else:
                # we DID modify the original Series
                assert np.isnan(s2[0])
                assert np.isnan(s[0])

    def test_copy_tzaware(self):
        # GH#11794
        # copy of tz-aware
        expected = Series([Timestamp("2012/01/01", tz="UTC")])
        expected2 = Series([Timestamp("1999/01/01", tz="UTC")])

        for deep in [None, False, True]:

            s = Series([Timestamp("2012/01/01", tz="UTC")])

            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[0] = pd.Timestamp("1999/01/01", tz="UTC")

            # default deep is True
            if deep is None or deep is True:
                # Did not modify original Series
                tm.assert_series_equal(s2, expected2)
                tm.assert_series_equal(s, expected)
            else:
                # we DID modify the original Series
                tm.assert_series_equal(s2, expected2)
                tm.assert_series_equal(s, expected2)

    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))
        assert s.dropna().sum("rows") == 3
        assert s._get_axis_number("rows") == 0
        assert s._get_axis_name("rows") == "index"

    def test_class_axis(self):
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(Series.index)

    def test_numpy_unique(self, datetime_series):
        # it works!
        np.unique(datetime_series)

    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(
            np.random.randn(1000, 3),
            columns=["A", "B", "C"],
            index=date_range("1/1/2000", periods=1000),
        )

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        with tm.assert_produces_warning(FutureWarning):
            s = Series([1])
            result = s.item()
            assert result == 1
            assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype="float64")
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))

        # compress
        # GH 6658
        s = Series([0, 1.0, -1], index=list("abc"))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=["b"]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result is an empty Series with an object-dtype Index, same as the original
        exp = Series([], dtype="float64", index=Index([], dtype="object"))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=[0.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result is an empty Series with a Float64Index, same as the original
        exp = Series([], dtype="float64", index=Index([], dtype="float64"))
        tm.assert_series_equal(result, exp)

    def test_str_accessor_updates_on_inplace(self):
        s = pd.Series(list("abc"))
        s.drop([0], inplace=True)
        assert len(s.str.lower()) == 2

    def test_str_attribute(self):
        # GH9068
        methods = ["strip", "rstrip", "lstrip"]
        s = Series([" jack", "jill ", " jesse ", "frank"])
        for method in methods:
            expected = Series([getattr(str, method)(x) for x in s.values])
            tm.assert_series_equal(getattr(Series.str, method)(s.str), expected)

        # str accessor only valid with string values
        s = Series(range(5))
        with pytest.raises(AttributeError, match="only use .str accessor"):
            s.str.repeat(2)

    def test_empty_method(self):
        s_empty = pd.Series()
        assert s_empty.empty

        for full_series in [pd.Series([1]), pd.Series(index=[1])]:
            assert not full_series.empty

    def test_tab_complete_warning(self, ip):
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip("IPython", minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        code = "import pandas as pd; s = pd.Series()"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter("ignore"):
                list(ip.Completer.completions("s.", 1))

    def test_integer_series_size(self):
        # GH 25580
        s = Series(range(9))
        assert s.size == 9
        s = Series(range(9), dtype="Int64")
        assert s.size == 9

    def test_get_values_deprecation(self):
        s = Series(range(9))
        with tm.assert_produces_warning(FutureWarning):
            res = s.get_values()
        tm.assert_numpy_array_equal(res, s.values)
Example #41
 def setup(self):
     n, k = 200, 5000
     levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
     codes = [np.random.choice(n, (k * n)) for lev in levels]
     self.mi = MultiIndex(levels=levels, codes=codes)
Example #42
import numpy as np
import pytest

import pandas as pd
from pandas.core.indexes.api import Index, MultiIndex
import pandas.util.testing as tm


@pytest.fixture(params=[tm.makeUnicodeIndex(100),
                        tm.makeStringIndex(100),
                        tm.makeDateIndex(100),
                        tm.makePeriodIndex(100),
                        tm.makeTimedeltaIndex(100),
                        tm.makeIntIndex(100),
                        tm.makeUIntIndex(100),
                        tm.makeRangeIndex(100),
                        tm.makeFloatIndex(100),
                        Index([True, False]),
                        tm.makeCategoricalIndex(100),
                        Index([]),
                        MultiIndex.from_tuples(zip(
                            ['foo', 'bar', 'baz'], [1, 2, 3])),
                        Index([0, 0, 1, 1, 2, 2])],
                ids=lambda x: type(x).__name__)
def indices(request):
    return request.param


@pytest.fixture(params=[1, np.array(1, dtype=np.int64)])
def one(request):
    # zero-dim integer array behaves like an integer
Example #43
class TestSeriesMisc(TestData, SharedWithSparse):

    series_klass = Series
    # SharedWithSparse tests use generic, series_klass-agnostic assertion
    _assert_series_equal = staticmethod(tm.assert_series_equal)

    def test_tab_completion(self):
        # GH 9910
        s = Series(list("abcd"))
        # Series of str values should have .str but not .dt/.cat in __dir__
        assert "str" in dir(s)
        assert "dt" not in dir(s)
        assert "cat" not in dir(s)

        # similarly for .dt
        s = Series(date_range("1/1/2015", periods=5))
        assert "dt" in dir(s)
        assert "str" not in dir(s)
        assert "cat" not in dir(s)

        # Similarly for .cat, but with the twist that str and dt should
        # also be present when the categories are of that type.
        s = Series(list("abbcd"), dtype="category")
        assert "cat" in dir(s)
        assert "str" in dir(s)  # as it is a string categorical
        assert "dt" not in dir(s)

        # similar to cat and str
        s = Series(date_range("1/1/2015", periods=5)).astype("category")
        assert "cat" in dir(s)
        assert "str" not in dir(s)
        assert "dt" in dir(s)  # as it is a datetime categorical

    def test_tab_completion_with_categorical(self):
        # test the tab completion display
        ok_for_cat = [
            "name",
            "index",
            "categorical",
            "categories",
            "codes",
            "ordered",
            "set_categories",
            "add_categories",
            "remove_categories",
            "rename_categories",
            "reorder_categories",
            "remove_unused_categories",
            "as_ordered",
            "as_unordered",
        ]

        def get_dir(s):
            results = [r for r in s.cat.__dir__() if not r.startswith("_")]
            return list(sorted(set(results)))

        s = Series(list("aabbcde")).astype("category")
        results = get_dir(s)
        tm.assert_almost_equal(results, list(sorted(set(ok_for_cat))))

    @pytest.mark.parametrize(
        "index",
        [
            tm.makeUnicodeIndex(10),
            tm.makeStringIndex(10),
            tm.makeCategoricalIndex(10),
            Index(["foo", "bar", "baz"] * 2),
            tm.makeDateIndex(10),
            tm.makePeriodIndex(10),
            tm.makeTimedeltaIndex(10),
            tm.makeIntIndex(10),
            tm.makeUIntIndex(10),
            tm.makeIntIndex(10),
            tm.makeFloatIndex(10),
            Index([True, False]),
            Index(["a{}".format(i) for i in range(101)]),
            pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")),
            pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")),
        ],
    )
    def test_index_tab_completion(self, index):
        # dir contains string-like values of the Index.
        s = pd.Series(index=index)
        dir_s = dir(s)
        for i, x in enumerate(s.index.unique(level=0)):
            if i < 100:
                assert not isinstance(
                    x, str) or not x.isidentifier() or x in dir_s
            else:
                assert x not in dir_s

    def test_not_hashable(self):
        s_empty = Series()
        s = Series([1])
        msg = "'Series' objects are mutable, thus they cannot be hashed"
        with pytest.raises(TypeError, match=msg):
            hash(s_empty)
        with pytest.raises(TypeError, match=msg):
            hash(s)

    def test_contains(self):
        tm.assert_contains_all(self.ts.index, self.ts)

    def test_iter(self):
        for i, val in enumerate(self.series):
            assert val == self.series[i]

        for i, val in enumerate(self.ts):
            assert val == self.ts[i]

    def test_keys(self):
        # HACK: By doing this in two stages, we avoid 2to3 wrapping the call
        # to .keys() in a list()
        getkeys = self.ts.keys
        assert getkeys() is self.ts.index

    def test_values(self):
        tm.assert_almost_equal(self.ts.values, self.ts, check_dtype=False)

    def test_iteritems(self):
        for idx, val in self.series.iteritems():
            assert val == self.series[idx]

        for idx, val in self.ts.iteritems():
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.iteritems(), "reverse")

    def test_items(self):
        for idx, val in self.series.items():
            assert val == self.series[idx]

        for idx, val in self.ts.items():
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.items(), "reverse")

    def test_raise_on_info(self):
        s = Series(np.random.randn(10))
        msg = "'Series' object has no attribute 'info'"
        with pytest.raises(AttributeError, match=msg):
            s.info()

    def test_copy(self):

        for deep in [None, False, True]:
            s = Series(np.arange(10), dtype="float64")

            # default deep is True
            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[::2] = np.NaN

            if deep is None or deep is True:
                # Did not modify original Series
                assert np.isnan(s2[0])
                assert not np.isnan(s[0])
            else:
                # we DID modify the original Series
                assert np.isnan(s2[0])
                assert np.isnan(s[0])

    def test_copy_tzaware(self):
        # GH#11794
        # copy of tz-aware
        expected = Series([Timestamp("2012/01/01", tz="UTC")])
        expected2 = Series([Timestamp("1999/01/01", tz="UTC")])

        for deep in [None, False, True]:

            s = Series([Timestamp("2012/01/01", tz="UTC")])

            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[0] = pd.Timestamp("1999/01/01", tz="UTC")

            # default deep is True
            if deep is None or deep is True:
                # Did not modify original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected)
            else:
                # we DID modify the original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected2)

    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))
        assert s.dropna().sum("rows") == 3
        assert s._get_axis_number("rows") == 0
        assert s._get_axis_name("rows") == "index"

    def test_class_axis(self):
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(Series.index)

    def test_numpy_unique(self):
        # it works!
        np.unique(self.ts)

    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(
            np.random.randn(1000, 3),
            columns=["A", "B", "C"],
            index=date_range("1/1/2000", periods=1000),
        )

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        with tm.assert_produces_warning(FutureWarning):
            s = Series([1])
            result = s.item()
            assert result == 1
            assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype="float64")
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))

        # compress
        # GH 6658
        s = Series([0, 1.0, -1], index=list("abc"))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=["b"]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result is an empty Series with an object-dtype Index, same as the original
        exp = Series([], dtype="float64", index=Index([], dtype="object"))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=[0.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result is an empty Series with a Float64Index, same as the original
        exp = Series([], dtype="float64", index=Index([], dtype="float64"))
        tm.assert_series_equal(result, exp)

    def test_str_accessor_updates_on_inplace(self):
        s = pd.Series(list("abc"))
        s.drop([0], inplace=True)
        assert len(s.str.lower()) == 2

    def test_str_attribute(self):
        # GH9068
        methods = ["strip", "rstrip", "lstrip"]
        s = Series([" jack", "jill ", " jesse ", "frank"])
        for method in methods:
            expected = Series([getattr(str, method)(x) for x in s.values])
            assert_series_equal(getattr(Series.str, method)(s.str), expected)

        # str accessor only valid with string values
        s = Series(range(5))
        with pytest.raises(AttributeError, match="only use .str accessor"):
            s.str.repeat(2)

    def test_empty_method(self):
        s_empty = pd.Series()
        assert s_empty.empty

        for full_series in [pd.Series([1]), pd.Series(index=[1])]:
            assert not full_series.empty

    def test_tab_complete_warning(self, ip):
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip("IPython", minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        code = "import pandas as pd; s = pd.Series()"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter("ignore"):
                list(ip.Completer.completions("s.", 1))

    def test_integer_series_size(self):
        # GH 25580
        s = Series(range(9))
        assert s.size == 9
        s = Series(range(9), dtype="Int64")
        assert s.size == 9

    def test_get_values_deprecation(self):
        s = Series(range(9))
        with tm.assert_produces_warning(FutureWarning):
            res = s.get_values()
        tm.assert_numpy_array_equal(res, s.values)
Example #44
0
    def setup(self):
        N = 10**4
        # a string level and a small integer range, presumably feeding a
        # MultiIndex.from_product-style benchmark
        self.iterables = [tm.makeStringIndex(N), range(20)]
Example #45
0
    def setup(self, regex):
        self.s = Series(tm.makeStringIndex(10**5))
Example #46
0
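    # assumed context, not shown in the snippet: time, sys, logging,
    # numpy as np, pandas as pd, io.BytesIO, pywren, JiffyClient,
    # open_or_create_jiffy_queues, read_jiffy_splits, pipeline_merge and merge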
    def run_command(key):
        pywren.wrenlogging.default_config('INFO')
        logging.basicConfig(level=logging.DEBUG)
        logger = logging.getLogger(__name__)
        logger.info("before everything")
        partition_num = key['partition_num']
        rounds = key['rounds']
        em = JiffyClient(host=key['em'])
        reduceId = key['taskId']
        appName = key['appName']
        alg_type = key['type']
        data_ques1 = open_or_create_jiffy_queues(em, appName, partition_num, 1,
                                                 'receiver')
        logger.info("queue opened")
        names = key['names']
        dtypes = key['dtypes']
        # ---- build the left table (100k unique string keys) ----
        left_table = 100000
        indices = tm.makeStringIndex(left_table).values
        left_key = indices[:left_table]  # renamed so the 'key' argument is not shadowed
        left = DataFrame({"key": left_key, "value": np.random.randn(left_table)})
        t_start = time.time()
        # ---- keep fetching chunks and joining until all partitions finish ----
        lim = 0
        fin_num = 0

        if alg_type == 'pipelined':
            leftsorter = None
            leftcount = None
            orizer = None
            intrizer = None
            count = 0

            while fin_num < partition_num and lim < 15:
                # read one chunk of the right table from the queue
                lim += 1
                time.sleep(0.01)
                logger.info("before get")
                obj = data_ques1[0].get()
                ds = pd.DataFrame()
                if sys.getsizeof(obj) > 1000:
                    ds = pd.read_table(BytesIO(obj),
                                       header=None,
                                       delimiter="|",
                                       names=['key', 'value2'])
                # chunks could equally be read via read_jiffy_splits:
                # ds, fin_num = read_jiffy_splits(names, dtypes, reduceId,
                #                                 data_ques1, fin_num,
                #                                 batch_size=1,
                #                                 fin_size=partition_num)
                logger.info(ds)
                logger.info(fin_num)
                if len(ds) > 0:
                    # join the chunk against the left table, carrying the
                    # factorizer/sorter state across calls so earlier work
                    # is reused instead of recomputed
                    # start = timeit.default_timer()
                    result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
                        left,
                        ds,
                        factorizer=orizer,
                        intfactorizer=intrizer,
                        leftsorter=leftsorter,
                        leftcount=leftcount,
                        slices=8,
                        how="pipeline")
                    time.sleep(0.8)
                    logger.info("merged")
                    # end = timeit.default_timer()
                    # count += (end - start)
                    # logger.info("chunk took " + str(end - start) +
                    #             " accumulated " + str(count))

        elif alg_type == 'origin':
            ds = pd.DataFrame()
            while fin_num < partition_num and lim < 1500:
                lim += 1
                #### read table
                dd, fin_num = read_jiffy_splits(names,
                                                dtypes,
                                                reduceId,
                                                data_ques1,
                                                fin_num,
                                                batch_size=1,
                                                fin_size=partition_num)
                if len(dd) > 0:
                    ds = ds.append(dd)
                print("this is ds:")
                print(ds)
                # baseline: re-run the full inner merge from scratch each round
                result = merge(left, ds, how="inner")
                print(fin_num)
        t_fin = time.time()
        # share.append([t_start, t_fin, fin_num])
        return [t_fin, t_start]
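
A driver would ship this function to workers via pywren with a key dict along
these lines (hypothetical values; the field names are taken from the reads at
the top of run_command):

    key = {
        'partition_num': 8,        # number of upstream map partitions
        'rounds': 1,
        'em': 'jiffy-host',        # JiffyClient host
        'taskId': 0,
        'appName': 'join-bench',
        'type': 'pipelined',       # or 'origin'
        'names': ['key', 'value2'],
        'dtypes': [str, float],
    }
    # e.g. pywren.default_executor().map(run_command, [key])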
Example #47
0
    def setup(self):
        # str.join('|') treats each string element as a sequence of
        # characters, producing values like 'a|b|c'
        self.s = Series(tm.makeStringIndex(10**5)).str.join('|')
Example #48
0
    def setup(self):
        self.uniques = tm.makeStringIndex(1000).values
        # each unique value repeated 10 times, presumably the input for a
        # factorize/duplicated-style benchmark
        self.all = self.uniques.repeat(10)
Example #49
0
def test_duplicates(idx):
    assert not idx.has_duplicates
    assert idx.append(idx).has_duplicates

    index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                       labels=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]])
    assert index.has_duplicates

    # GH 9075
    t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169),
         (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119),
         (u('x'), u('out'), u('z'), 9, u('y'), u('in'), u('z'), 135),
         (u('x'), u('out'), u('z'), 13, u('y'), u('in'), u('z'), 145),
         (u('x'), u('out'), u('z'), 14, u('y'), u('in'), u('z'), 158),
         (u('x'), u('out'), u('z'), 16, u('y'), u('in'), u('z'), 122),
         (u('x'), u('out'), u('z'), 17, u('y'), u('in'), u('z'), 160),
         (u('x'), u('out'), u('z'), 18, u('y'), u('in'), u('z'), 180),
         (u('x'), u('out'), u('z'), 20, u('y'), u('in'), u('z'), 143),
         (u('x'), u('out'), u('z'), 21, u('y'), u('in'), u('z'), 128),
         (u('x'), u('out'), u('z'), 22, u('y'), u('in'), u('z'), 129),
         (u('x'), u('out'), u('z'), 25, u('y'), u('in'), u('z'), 111),
         (u('x'), u('out'), u('z'), 28, u('y'), u('in'), u('z'), 114),
         (u('x'), u('out'), u('z'), 29, u('y'), u('in'), u('z'), 121),
         (u('x'), u('out'), u('z'), 31, u('y'), u('in'), u('z'), 126),
         (u('x'), u('out'), u('z'), 32, u('y'), u('in'), u('z'), 155),
         (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123),
         (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)]

    index = pd.MultiIndex.from_tuples(t)
    assert not index.has_duplicates

    # handle int64 overflow if possible
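    # pandas falls back to hashing the values when the product of the level
    # sizes no longer fits in int64, so both code paths get exercised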
    def check(nlevels, with_nulls):
        labels = np.tile(np.arange(500), 2)
        level = np.arange(500)

        if with_nulls:  # inject some null values
            labels[500] = -1  # common nan value
            labels = [labels.copy() for i in range(nlevels)]
            for i in range(nlevels):
                labels[i][500 + i - nlevels // 2] = -1

            labels += [np.array([-1, 1]).repeat(500)]
        else:
            labels = [labels] * nlevels + [np.arange(2).repeat(500)]

        levels = [level] * nlevels + [[0, 1]]

        # no dups
        index = MultiIndex(levels=levels, labels=labels)
        assert not index.has_duplicates

        # with a dup
        if with_nulls:

            def f(a):
                return np.insert(a, 1000, a[0])

            labels = list(map(f, labels))
            index = MultiIndex(levels=levels, labels=labels)
        else:
            values = index.values.tolist()
            index = MultiIndex.from_tuples(values + [values[0]])

        assert index.has_duplicates

    # no overflow
    check(4, False)
    check(4, True)

    # overflow possible
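    # with 8 levels, 500**8 * 2 ~ 3.9e21 label combinations exceed the int64
    # range (~9.2e18), whereas 500**4 * 2 ~ 1.25e11 above did not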
    check(8, False)
    check(8, True)

    # GH 9125
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    labels = [np.random.choice(n, k * n) for lev in levels]
    mi = MultiIndex(levels=levels, labels=labels)

    for keep in ['first', 'last', False]:
        left = mi.duplicated(keep=keep)
        right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep)
        tm.assert_numpy_array_equal(left, right)

    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        with warnings.catch_warnings(record=True):
            # Deprecated - see GH20239
            assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))

        tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype='bool'))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
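            # a label of -1 encodes a missing value (NaN) at that position,
            # which is why each level contributes (size + 1) combinations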
            lab = product(range(-1, n), range(-1, m))
            mi = MultiIndex(levels=[list('abcde')[:n],
                                    list('WXYZ')[:m]],
                            labels=np.random.permutation(list(lab)).T)
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            with warnings.catch_warnings(record=True):
                # Deprecated - see GH20239
                assert mi.get_duplicates().equals(
                    MultiIndex.from_arrays([[], []]))

            tm.assert_numpy_array_equal(mi.duplicated(),
                                        np.zeros(len(mi), dtype='bool'))
Example #50
0
    def setup(self):
        N = 10000
        K = 10
        # N * K = 100,000 rows, each string key appearing K times
        self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K),
                             'key2': tm.makeStringIndex(N).values.repeat(K),
                             'value': np.random.randn(N * K)})
Example #51
0
    def setUp(self):
        self.strIndex = common.makeStringIndex(100)
        self.dateIndex = common.makeDateIndex(100)
        self.intIndex = common.makeIntIndex(100)
Example #52
0
import numpy as np

from pandas import DataFrame, pipeline_merge
import pandas.util.testing as tm

N = 10000000
pieces = 10
indices = tm.makeStringIndex(N).values
indices2 = tm.makeStringIndex(N).values
key = np.tile(indices[:500000], 1)
key2 = np.tile(indices2[:500000], 1)
left = DataFrame({"key": key, "value": np.random.randn(500000)})
right = DataFrame({
    "key": indices[1 * 100000 + 50000:9 * 100000 + 50000],
    "value2": np.random.randn(800000),
})
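# the key ranges overlap on indices[150000:500000], so an inner-style merge
# should match roughly 350,000 rows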
result = pipeline_merge(left, right, how="pipeline")

#    def time_merge_dataframe_integer_2key(self, sort):
#        pipeline_merge(self.df, self.df3, how="pipeline")
#
#    def time_merge_dataframe_integer_key(self, sort):
Example #53
0
class TestSeriesMisc(TestData, SharedWithSparse):

    series_klass = Series
    # SharedWithSparse tests use generic, series_klass-agnostic assertion
    _assert_series_equal = staticmethod(tm.assert_series_equal)

    def test_tab_completion(self):
        # GH 9910
        s = Series(list('abcd'))
        # Series of str values should have .str but not .dt/.cat in __dir__
        assert 'str' in dir(s)
        assert 'dt' not in dir(s)
        assert 'cat' not in dir(s)

        # similarly for .dt
        s = Series(date_range('1/1/2015', periods=5))
        assert 'dt' in dir(s)
        assert 'str' not in dir(s)
        assert 'cat' not in dir(s)

        # Similarly for .cat, but with the twist that str and dt should be
        # there only if the categories are of that type; first a string
        # categorical:
        s = Series(list('abbcd'), dtype="category")
        assert 'cat' in dir(s)
        assert 'str' in dir(s)  # as it is a string categorical
        assert 'dt' not in dir(s)

        # similar to cat and str
        s = Series(date_range('1/1/2015', periods=5)).astype("category")
        assert 'cat' in dir(s)
        assert 'str' not in dir(s)
        assert 'dt' in dir(s)  # as it is a datetime categorical

    def test_tab_completion_with_categorical(self):
        # test the tab completion display
        ok_for_cat = [
            'categories', 'codes', 'ordered', 'set_categories',
            'add_categories', 'remove_categories', 'rename_categories',
            'reorder_categories', 'remove_unused_categories', 'as_ordered',
            'as_unordered'
        ]

        def get_dir(s):
            results = [r for r in s.cat.__dir__() if not r.startswith('_')]
            return list(sorted(set(results)))

        s = Series(list('aabbcde')).astype('category')
        results = get_dir(s)
        tm.assert_almost_equal(results, list(sorted(set(ok_for_cat))))

    @pytest.mark.parametrize("index", [
        tm.makeUnicodeIndex(10),
        tm.makeStringIndex(10),
        tm.makeCategoricalIndex(10),
        Index(['foo', 'bar', 'baz'] * 2),
        tm.makeDateIndex(10),
        tm.makePeriodIndex(10),
        tm.makeTimedeltaIndex(10),
        tm.makeIntIndex(10),
        tm.makeUIntIndex(10),
        tm.makeIntIndex(10),
        tm.makeFloatIndex(10),
        Index([True, False]),
        Index(['a{}'.format(i) for i in range(101)]),
        pd.MultiIndex.from_tuples(lzip('ABCD', 'EFGH')),
        pd.MultiIndex.from_tuples(lzip([0, 1, 2, 3], 'EFGH')),
    ])
    def test_index_tab_completion(self, index):
        # dir contains string-like values of the Index.
        s = pd.Series(index=index)
        dir_s = dir(s)
        for i, x in enumerate(s.index.unique(level=0)):
            if i < 100:
                assert (not isinstance(x, string_types) or not isidentifier(x)
                        or x in dir_s)
            else:
                assert x not in dir_s

    def test_not_hashable(self):
        s_empty = Series()
        s = Series([1])
        pytest.raises(TypeError, hash, s_empty)
        pytest.raises(TypeError, hash, s)

    def test_contains(self):
        tm.assert_contains_all(self.ts.index, self.ts)

    def test_iter(self):
        for i, val in enumerate(self.series):
            assert val == self.series[i]

        for i, val in enumerate(self.ts):
            assert val == self.ts[i]

    def test_keys(self):
        # HACK: By doing this in two stages, we avoid 2to3 wrapping the call
        # to .keys() in a list()
        getkeys = self.ts.keys
        assert getkeys() is self.ts.index

    def test_values(self):
        tm.assert_almost_equal(self.ts.values, self.ts, check_dtype=False)

    def test_iteritems(self):
        for idx, val in compat.iteritems(self.series):
            assert val == self.series[idx]

        for idx, val in compat.iteritems(self.ts):
            assert val == self.ts[idx]

        # assert the result is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.iteritems(), 'reverse')

    def test_items(self):
        for idx, val in self.series.items():
            assert val == self.series[idx]

        for idx, val in self.ts.items():
            assert val == self.ts[idx]

        # assert the result is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.items(), 'reverse')

    def test_raise_on_info(self):
        s = Series(np.random.randn(10))
        with pytest.raises(AttributeError):
            s.info()

    def test_copy(self):

        for deep in [None, False, True]:
            s = Series(np.arange(10), dtype='float64')

            # default deep is True
            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[::2] = np.NaN

            if deep is None or deep is True:
                # Did not modify original Series
                assert np.isnan(s2[0])
                assert not np.isnan(s[0])
            else:
                # we DID modify the original Series
                assert np.isnan(s2[0])
                assert np.isnan(s[0])

        # GH 11794
        # copy of tz-aware
        expected = Series([Timestamp('2012/01/01', tz='UTC')])
        expected2 = Series([Timestamp('1999/01/01', tz='UTC')])

        for deep in [None, False, True]:

            s = Series([Timestamp('2012/01/01', tz='UTC')])

            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[0] = pd.Timestamp('1999/01/01', tz='UTC')

            # default deep is True
            if deep is None or deep is True:
                # Did not modify original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected)
            else:
                # we DID modify the original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected2)

    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        assert_series_equal(s.dropna(axis='rows'), s.dropna(axis='index'))
        assert s.dropna().sum('rows') == 3
        assert s._get_axis_number('rows') == 0
        assert s._get_axis_name('rows') == 'index'

    def test_class_axis(self):
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(Series.index)

    def test_numpy_unique(self):
        # it works!
        np.unique(self.ts)

    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(np.random.randn(1000, 3),
                         columns=['A', 'B', 'C'],
                         index=date_range('1/1/2000', periods=1000))

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        s = Series([1])
        result = s.item()
        assert result == 1
        assert s.item() == s.iloc[0]

        # using an ndarray-like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype='float64')
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F'))

        # compress
        # GH 6658
        s = Series([0, 1., -1], index=list('abc'))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=['b']))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # the result is an empty Series whose object-dtype Index matches the
        # original
        exp = Series([], dtype='float64', index=Index([], dtype='object'))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1., -1], index=[.1, .2, .3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=[.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # the result is an empty Series whose Float64Index matches the
        # original
        exp = Series([], dtype='float64', index=Index([], dtype='float64'))
        tm.assert_series_equal(result, exp)

    def test_str_attribute(self):
        # GH9068
        methods = ['strip', 'rstrip', 'lstrip']
        s = Series([' jack', 'jill ', ' jesse ', 'frank'])
        for method in methods:
            expected = Series([getattr(str, method)(x) for x in s.values])
            assert_series_equal(getattr(Series.str, method)(s.str), expected)

        # str accessor only valid with string values
        s = Series(range(5))
        with tm.assert_raises_regex(AttributeError, 'only use .str accessor'):
            s.str.repeat(2)

    def test_empty_method(self):
        s_empty = pd.Series()
        assert s_empty.empty

        for full_series in [pd.Series([1]), pd.Series(index=[1])]:
            assert not full_series.empty

    def test_tab_complete_warning(self, ip):
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip('IPython', minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        code = "import pandas as pd; s = pd.Series()"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter('ignore'):
                list(ip.Completer.completions('s.', 1))
Example #54
0
    def setup_method(self, method):
        self.bool_index = tm.makeBoolIndex(10, name="a")
        self.int_index = tm.makeIntIndex(10, name="a")
        self.float_index = tm.makeFloatIndex(10, name="a")
        self.dt_index = tm.makeDateIndex(10, name="a")
        self.dt_tz_index = tm.makeDateIndex(
            10, name="a").tz_localize(tz="US/Eastern")
        self.period_index = tm.makePeriodIndex(10, name="a")
        self.string_index = tm.makeStringIndex(10, name="a")
        self.unicode_index = tm.makeUnicodeIndex(10, name="a")

        arr = np.random.randn(10)
        self.bool_series = Series(arr, index=self.bool_index, name="a")
        self.int_series = Series(arr, index=self.int_index, name="a")
        self.float_series = Series(arr, index=self.float_index, name="a")
        self.dt_series = Series(arr, index=self.dt_index, name="a")
        self.dt_tz_series = self.dt_tz_index.to_series()
        self.period_series = Series(arr, index=self.period_index, name="a")
        self.string_series = Series(arr, index=self.string_index, name="a")
        self.unicode_series = Series(arr, index=self.unicode_index, name="a")

        types = [
            "bool", "int", "float", "dt", "dt_tz", "period", "string",
            "unicode"
        ]
        self.indexes = [getattr(self, "{}_index".format(t)) for t in types]
        self.series = [getattr(self, "{}_series".format(t)) for t in types]

        # To test narrow dtypes, we use narrower *data* elements, not *index* elements
        index = self.int_index
        self.float32_series = Series(arr.astype(np.float32),
                                     index=index,
                                     name="a")

        arr_int = np.random.choice(10, size=10, replace=False)
        self.int8_series = Series(arr_int.astype(np.int8),
                                  index=index,
                                  name="a")
        self.int16_series = Series(arr_int.astype(np.int16),
                                   index=index,
                                   name="a")
        self.int32_series = Series(arr_int.astype(np.int32),
                                   index=index,
                                   name="a")

        self.uint8_series = Series(arr_int.astype(np.uint8),
                                   index=index,
                                   name="a")
        self.uint16_series = Series(arr_int.astype(np.uint16),
                                    index=index,
                                    name="a")
        self.uint32_series = Series(arr_int.astype(np.uint32),
                                    index=index,
                                    name="a")

        nrw_types = [
            "float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"
        ]
        self.narrow_series = [
            getattr(self, "{}_series".format(t)) for t in nrw_types
        ]

        self.objs = self.indexes + self.series + self.narrow_series
Example #55
0
    def setup(self):
        np.random.seed(1234)  # fixed seed keeps the benchmark deterministic
        self.uniques = tm.makeStringIndex(1000).values
        self.all = self.uniques.repeat(10)
Example #56
0
import numpy as np
import pytest

import pandas as pd
from pandas.core.indexes.api import Index, MultiIndex
import pandas.util.testing as tm

indices_dict = {
    "unicode": tm.makeUnicodeIndex(100),
    "string": tm.makeStringIndex(100),
    "datetime": tm.makeDateIndex(100),
    "period": tm.makePeriodIndex(100),
    "timedelta": tm.makeTimedeltaIndex(100),
    "int": tm.makeIntIndex(100),
    "uint": tm.makeUIntIndex(100),
    "range": tm.makeRangeIndex(100),
    "float": tm.makeFloatIndex(100),
    "bool": Index([True, False]),
    "categorical": tm.makeCategoricalIndex(100),
    "interval": tm.makeIntervalIndex(100),
    "empty": Index([]),
    "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])),
    "repeats": Index([0, 0, 1, 1, 2, 2]),
}


@pytest.fixture(params=indices_dict.keys())
def indices(request):
    # copy to avoid mutation, e.g. setting .name
    return indices_dict[request.param].copy()
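
A test consumes the fixture by naming it as an argument; a minimal sketch
(hypothetical test, not part of the source):

def test_copy_preserves_values(indices):
    # every parametrized index should survive a round-trip through copy()
    assert indices.copy().equals(indices)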
Example #57
0
@pytest.mark.parametrize(
    "idx",
    [tm.makeIntIndex(10),
     tm.makeFloatIndex(10),
     tm.makePeriodIndex(10)])
def test_invalid_index_types(idx):
    msg = ("(cannot infer freq from a non-convertible)|"
           "(Check the `freq` attribute instead of using infer_freq)")

    with pytest.raises(TypeError, match=msg):
        frequencies.infer_freq(idx)


@pytest.mark.skipif(is_platform_windows(),
                    reason="see gh-10822: Windows issue")
@pytest.mark.parametrize(
    "idx",
    [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)])
def test_invalid_index_types_unicode(idx):
    # see gh-10822
    #
    # Odd error message on conversions to datetime for unicode.
    msg = "Unknown string format"

    with pytest.raises(ValueError, match=msg):
        frequencies.infer_freq(idx)


def test_string_datetime_like_compat():
    # see gh-6463
    data = ["2004-01", "2004-02", "2004-03", "2004-04"]

    expected = frequencies.infer_freq(data)
Example #58
0
    def setup(self, keep):
        N = 10**5
        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
        self.string_idx = tm.makeStringIndex(N)
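        # note: unlike int_idx and float_idx, string_idx is not repeated, so
        # it contains no duplicates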