def test_combine_first(self):
    """Exercise ``Series.combine_first`` on float, mixed and empty inputs.

    Covers four cases: a fully populated caller (nothing is taken from the
    argument), a caller with NaN holes (holes are filled from the argument),
    a string series combined with a float series, and combining with an
    empty object-dtype series.
    """
    values = tm.makeIntIndex(20).values.astype(float)
    series = Series(values, index=tm.makeIntIndex(20))

    series_copy = series * 2
    # Use the canonical lowercase spelling; the ``np.NaN`` alias no longer
    # exists in NumPy 2.0.
    series_copy[::2] = np.nan

    # nothing used from the input
    combined = series.combine_first(series_copy)
    tm.assert_series_equal(combined, series)

    # Holes filled from input
    combined = series_copy.combine_first(series)
    assert np.isfinite(combined).all()

    tm.assert_series_equal(combined[::2], series[::2])
    tm.assert_series_equal(combined[1::2], series_copy[1::2])

    # mixed types
    index = tm.makeStringIndex(20)
    floats = Series(np.random.randn(20), index=index)
    strings = Series(tm.makeStringIndex(10), index=index[::2])

    combined = strings.combine_first(floats)

    tm.assert_series_equal(strings, combined.loc[index[::2]])
    tm.assert_series_equal(floats[1::2].astype(object), combined.loc[index[1::2]])

    # corner case
    ser = Series([1.0, 2, 3], index=[0, 1, 2])
    empty = Series([], index=[], dtype=object)
    result = ser.combine_first(empty)
    # The expected index is compared as object dtype.
    ser.index = ser.index.astype("O")
    tm.assert_series_equal(ser, result)
def setup(self):
    """Build ``self.df`` with two repeated string key columns and random values."""
    n_keys = 10000
    repeats = 10
    first_key = tm.makeStringIndex(n_keys).values.repeat(repeats)
    second_key = tm.makeStringIndex(n_keys).values.repeat(repeats)
    noise = np.random.randn(n_keys * repeats)
    self.df = DataFrame({"key1": first_key, "key2": second_key, "value": noise})
def setup(self):
    """Prepare ``self.col_array_list``, a list of three equal-length rows.

    Two rows hold string keys (each key repeated ``K`` times) and the third
    holds random floats. The rows are stacked with ``np.vstack`` and the
    stacked array is then split back into a list of its rows.
    """
    N = 10000
    K = 10
    key1 = tm.makeStringIndex(N).values.repeat(K)
    key2 = tm.makeStringIndex(N).values.repeat(K)
    col_array = np.vstack([key1, key2, np.random.randn(N * K)])
    self.col_array_list = list(col_array)
def setup(self):
    """Create a 1000x30 random frame plus scalar labels and boolean masks."""
    row_labels = tm.makeStringIndex(1000)
    col_labels = tm.makeStringIndex(30)
    # Any warning raised while constructing the frame is recorded, not shown.
    with warnings.catch_warnings(record=True):
        self.df = DataFrame(
            np.random.randn(1000, 30), index=row_labels, columns=col_labels
        )

    self.idx_scalar = row_labels[100]
    self.col_scalar = col_labels[10]

    # The same "column > 0" mask in three dtypes: bool, object, nullable boolean.
    self.bool_indexer = self.df[self.col_scalar] > 0
    self.bool_obj_indexer = self.bool_indexer.astype(object)
    self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean")
def setup(self, other_cols, sep, na_rep, na_frac):
    """Build ``self.s`` and ``self.others`` with a random share of missing values.

    ``na_frac`` is the probability that an entry is masked out. ``other_cols``
    is the number of columns in ``self.others``; zero means ``None``.
    ``sep`` and ``na_rep`` are not used here.
    """
    N = 10**5

    def random_mask():
        # True keeps the value, False (probability ``na_frac``) masks it.
        return np.random.choice([True, False], N, p=[1 - na_frac, na_frac])

    self.s = Series(tm.makeStringIndex(N)).where(random_mask())

    if other_cols != 0:
        columns = {}
        for i in range(other_cols):
            columns[i] = tm.makeStringIndex(N).where(random_mask())
        self.others = DataFrame(columns)
    else:
        # str.cat self-concatenates only for others=None
        self.others = None
def setup(self, dtype):
    """Create int, float, bool and single-letter string frames of one shape.

    All four frames share ``self.index`` and ``self.columns``. ``dtype`` is
    not used here.
    """
    n_rows, n_cols = 5000, 50
    self.index = tm.makeStringIndex(n_rows)
    self.columns = tm.makeStringIndex(n_cols)
    shape = (n_rows, n_cols)

    def labelled(values):
        # Wrap raw values with the shared row and column labels.
        return DataFrame(values, index=self.index, columns=self.columns)

    self.df_int = labelled(np.random.randint(low=100, size=shape))
    self.df_float = labelled(np.random.randn(n_rows, n_cols))
    self.df_bool = labelled(np.random.choice([True, False], size=shape))
    letters = list(string.ascii_letters)
    self.df_string = labelled(np.random.choice(letters, size=shape))
def setup(self):
    """Prepare dict-based inputs derived from a 5000x50 random frame.

    Stores the column-oriented dict (``self.data``), the list of row records
    (``self.dict_list``) and an independent nested dict of floats
    (``self.data2``).
    """
    n_rows, n_cols = 5000, 50
    self.index = tm.makeStringIndex(n_rows)
    self.columns = tm.makeStringIndex(n_cols)
    source = DataFrame(
        np.random.randn(n_rows, n_cols), index=self.index, columns=self.columns
    )
    self.data = source.to_dict()
    self.dict_list = source.to_dict(orient="records")

    # 2000 outer keys, each mapping to its own 100-entry {int: float} dict.
    nested = {}
    for outer in range(2000):
        nested[outer] = {inner: float(inner) for inner in range(100)}
    self.data2 = nested
def setup(self):
    """Create a datetime-indexed frame, an integer frame and a MultiIndex series."""
    minutes = date_range(start="1/1/1970", periods=10000, freq="1min")
    self.df = DataFrame(np.random.rand(10000, 10), index=minutes, columns=range(10))
    self.df["foo"] = "bar"
    # Every second timestamp of the frame's index.
    self.rng_subset = Index(minutes[::2])

    self.df2 = DataFrame(
        data=np.random.rand(10000, 30), index=range(10000), columns=range(30)
    )

    n_outer = 5000
    n_inner = 200
    outer_labels = tm.makeStringIndex(n_outer).values.repeat(n_inner)
    inner_labels = np.tile(tm.makeStringIndex(n_inner).values, n_outer)
    two_level = MultiIndex.from_arrays([outer_labels, inner_labels])
    self.s = Series(np.random.randn(n_outer * n_inner), index=two_level)
    self.s_subset = self.s[::2]
def setup(self, dtype, method):
    """Select a left/right index pair of the requested kind.

    Parameters
    ----------
    dtype : str
        One of ``"datetime"``, ``"date_string"``, ``"int"`` or ``"strings"``;
        any other value raises ``KeyError`` on lookup.
    method
        Not used here.

    ``self.right`` is always ``self.left`` without its final element.
    """
    N = 10**5
    # "min" is the minute alias accepted by both old and new pandas; the
    # single-letter "T" alias is deprecated.
    dates_left = date_range("1/1/2000", periods=N, freq="min")
    fmt = "%Y-%m-%d %H:%M:%S"
    date_str_left = Index(dates_left.strftime(fmt))
    int_left = Index(np.arange(N))
    str_left = tm.makeStringIndex(N)

    data = {
        "datetime": {"left": dates_left, "right": dates_left[:-1]},
        "date_string": {"left": date_str_left, "right": date_str_left[:-1]},
        "int": {"left": int_left, "right": int_left[:-1]},
        "strings": {"left": str_left, "right": str_left[:-1]},
    }

    self.left = data[dtype]["left"]
    self.right = data[dtype]["right"]
def setup(self):
    """Build a MultiIndex frame with sparse NaNs and four integer frames."""
    n_outer = 400
    n_inner = 250
    outer_codes = np.repeat(range(n_outer), n_inner).tolist()
    inner_codes = list(range(n_inner)) * n_outer
    two_level = MultiIndex(
        levels=[np.arange(n_outer), tm.makeStringIndex(n_inner)],
        codes=[outer_codes, inner_codes],
        names=["lev1", "lev2"],
    )

    values = np.random.randn(n_outer * n_inner, 3)
    # One NaN per 10000 rows in each column, offset by the column position.
    for col in range(3):
        values[col::10000, col] = np.nan
    self.df = DataFrame(values, index=two_level, columns=["col1", "col20", "col3"])

    n = 20000
    names = ["jim", "joe", "jolie"]

    self.df1 = DataFrame(np.random.randint(1, n, (n, 3)), columns=names)
    # df2 is df1 with "jim" overwritten by "joe".
    self.df2 = self.df1.copy()
    self.df2["jim"] = self.df2["joe"]

    self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)), columns=names)
    # df4 is df3 with "jim" overwritten by "joe".
    self.df4 = self.df3.copy()
    self.df4["jim"] = self.df4["joe"]
def setup(self):
    """Create ``self.mi``: a three-level MultiIndex with random codes."""
    size = 200
    n_codes = 5000 * size
    level_values = [
        np.arange(size),
        tm.makeStringIndex(size).values,
        1000 + np.arange(size),
    ]
    # One independent random code array per level.
    level_codes = []
    for _ in level_values:
        level_codes.append(np.random.choice(size, n_codes))
    self.mi = MultiIndex(levels=level_values, codes=level_codes)
def test_invalid_index_types_unicode():
    """``infer_freq`` on a string index raises ``ValueError``."""
    # see gh-10822
    #
    # Odd error message on conversions to datetime for unicode.
    string_index = tm.makeStringIndex(10)
    expected_message = "Unknown string format"

    with pytest.raises(ValueError, match=expected_message):
        frequencies.infer_freq(string_index)
def setup_cache(self):
    """Return a dict of four one-million-element Series.

    Two integer Series and two string Series, each with a small (100) or a
    large (10000) number of candidate values.
    """
    size = 10**6

    def sampled_strings(n_unique):
        # Draw ``size`` positions at random from a pool of ``n_unique`` strings.
        pool = tm.makeStringIndex(n_unique)
        return Series(pool.take(np.random.randint(0, n_unique, size=size)))

    data = {}
    data["int64_small"] = Series(np.random.randint(0, 100, size=size))
    data["int64_large"] = Series(np.random.randint(0, 10000, size=size))
    data["object_small"] = sampled_strings(100)
    data["object_large"] = sampled_strings(10000)
    return data
def setup(self):
    """Create two random Series whose string indexes partially overlap.

    ``self.x`` uses all 50000 labels; ``self.y`` uses 40000 of them, drawn
    without replacement.
    """
    total = 50000
    labels = tm.makeStringIndex(total)
    sample = 40000

    self.x = Series(np.random.randn(total), labels)

    y_values = np.random.randn(sample)
    y_labels = np.random.choice(labels, sample, replace=False)
    self.y = Series(y_values, index=y_labels)
def test_repr_mixed_big(self):
    """``repr`` of a 200-row float/string frame with leading NaNs runs cleanly."""
    # big mixed
    columns = {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}
    biggie = DataFrame(columns, index=range(200))

    # Blank out the leading rows (labels up to and including 20) in each column.
    for name in ("A", "B"):
        biggie.loc[:20, name] = np.nan

    repr(biggie)
def test_duplicated_large(keep):
    """``MultiIndex.duplicated`` matches ``hashtable.duplicated`` on its values."""
    # GH 9125
    size = 200
    n_codes = 5000 * size
    level_values = [np.arange(size), tm.makeStringIndex(size), 1000 + np.arange(size)]
    level_codes = []
    for _ in level_values:
        level_codes.append(np.random.choice(size, n_codes))
    mi = MultiIndex(levels=level_values, codes=level_codes)

    expected = hashtable.duplicated(mi.values, keep=keep)
    result = mi.duplicated(keep=keep)
    tm.assert_numpy_array_equal(result, expected)
def _generate_dataframe():
    """Return a 2000-row frame with five float columns and one string column.

    The frame is indexed by hourly timestamps starting at 2000-01-01. The
    float columns are named ``float0`` .. ``float4`` and the string column
    is named ``object``.
    """
    N = 2000
    C = 5
    df = DataFrame(
        np.random.randn(N, C),
        columns=[f"float{i}" for i in range(C)],
        # Lowercase "h" is the hourly alias; uppercase "H" is deprecated.
        index=date_range("20000101", periods=N, freq="h"),
    )
    df["object"] = tm.makeStringIndex(N)
    return df
def setup(self, inplace):
    """Create keyed, NaN-masked, integer, string and boolean fixtures.

    ``inplace`` is not used here.
    """
    n_keys = 10000
    repeats = 10
    first_key = tm.makeStringIndex(n_keys).values.repeat(repeats)
    second_key = tm.makeStringIndex(n_keys).values.repeat(repeats)
    values = np.random.randn(n_keys * repeats)
    self.df = DataFrame({"key1": first_key, "key2": second_key, "value": values})

    # Same frame with its first 10000 rows set to NaN in every column.
    nan_frame = self.df.copy()
    nan_frame.iloc[:10000, :] = np.nan
    self.df_nan = nan_frame

    self.s = Series(np.random.randint(0, 1000, size=10000))
    self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

    n_rows = 1000000
    n_groups = 10000
    int_keys = np.random.randint(0, n_groups, size=n_rows)
    self.df_int = DataFrame({"key1": int_keys})
    self.df_bool = DataFrame(np.random.randint(0, 2, size=(n_groups, 10), dtype=bool))
def test_sub_fail(self):
    """Subtraction involving a string index raises ``TypeError``."""
    index = tm.makeStringIndex(100)

    # Each callable performs one subtraction that is expected to fail.
    attempts = [
        lambda: index - "a",
        lambda: index - index,
        lambda: index - index.tolist(),
        lambda: index.tolist() - index,
    ]
    for attempt in attempts:
        with pytest.raises(TypeError):
            attempt()
def setup(self, format):
    """Write a 100000-row mixed frame to ``__test__.h5`` in the given format.

    Parameters
    ----------
    format
        Forwarded unchanged to ``DataFrame.to_hdf`` as its ``format`` argument.

    The frame has five float columns (``float0`` .. ``float4``), one string
    column (``object``) and an hourly datetime index. It is kept on
    ``self.df`` and stored in the file under key ``"df"``.
    """
    self.fname = "__test__.h5"
    N = 100000
    C = 5
    self.df = DataFrame(
        np.random.randn(N, C),
        columns=[f"float{i}" for i in range(C)],
        # Lowercase "h" is the hourly alias; uppercase "H" is deprecated.
        index=date_range("20000101", periods=N, freq="h"),
    )
    self.df["object"] = tm.makeStringIndex(N)
    # Pass ``key`` by keyword: positional arguments after the path are
    # deprecated for ``to_hdf``.
    self.df.to_hdf(self.fname, key="df", format=format)
def test_sub_fail(self):
    """Subtraction involving a string index raises a matching ``TypeError``."""
    index = tm.makeStringIndex(100)
    as_list = index.tolist()
    expected_message = "unsupported operand type|Cannot broadcast"

    # index minus scalar string
    with pytest.raises(TypeError, match=expected_message):
        index - "a"

    # index minus itself
    with pytest.raises(TypeError, match=expected_message):
        index - index

    # index minus list, then list minus index
    with pytest.raises(TypeError, match=expected_message):
        index - as_list
    with pytest.raises(TypeError, match=expected_message):
        as_list - index
def test_add(self):
    """``+`` on a string index concatenates element-wise, on either side."""
    index = tm.makeStringIndex(100)
    as_list = index.tolist()
    doubled = pd.Index(index.values * 2)

    # Index + Index, Index + list and list + Index all give the same result.
    for result in (index + index, index + as_list, as_list + index):
        tm.assert_index_equal(result, doubled)

    # test add and radd
    letters = pd.Index(list("abc"))
    tm.assert_index_equal(letters + "1", pd.Index(["a1", "b1", "c1"]))
    tm.assert_index_equal("1" + letters, pd.Index(["1a", "1b", "1c"]))
def biggie_df_fixture(request):
    """Fixture for a big mixed Dataframe and an empty Dataframe"""
    kind = request.param

    if kind == "empty":
        # 200 rows, no columns.
        return DataFrame(index=np.arange(200))

    if kind == "mixed":
        mixed = DataFrame(
            {"A": np.random.randn(200), "B": tm.makeStringIndex(200)},
            index=np.arange(200),
        )
        # Blank out the leading rows (labels up to and including 20).
        for column in ("A", "B"):
            mixed.loc[:20, column] = np.nan
        return mixed

    # Any other param yields None.
    return None
def setup(self, skiprows):
    """Write a 20000-row mixed-dtype frame to ``self.fname`` as CSV.

    ``skiprows`` is not used here.
    """
    n_rows = 20000
    row_labels = tm.makeStringIndex(n_rows)

    columns = {}
    columns["float1"] = np.random.randn(n_rows)
    columns["float2"] = np.random.randn(n_rows)
    columns["string1"] = ["foo"] * n_rows
    columns["bool1"] = [True] * n_rows
    columns["int1"] = np.random.randint(0, n_rows, size=n_rows)

    frame = DataFrame(columns, index=row_labels)
    frame.to_csv(self.fname)
def get_objs():
    """Return seven length-10 indexes followed by one Series per index.

    Every index and every Series is named ``"a"``; all Series share the same
    random values.
    """
    date_index = tm.makeDateIndex(10, name="a")
    indexes = [
        tm.makeBoolIndex(10, name="a"),
        tm.makeIntIndex(10, name="a"),
        tm.makeFloatIndex(10, name="a"),
        date_index,
        tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"),
        tm.makePeriodIndex(10, name="a"),
        tm.makeStringIndex(10, name="a"),
    ]

    arr = np.random.randn(10)
    series = []
    for idx in indexes:
        series.append(Series(arr, index=idx, name="a"))

    return indexes + series
def setup(self, threads):
    """Prepare two callables that factorize a 100000-element string index.

    ``self.parallel`` is wrapped with ``test_parallel(num_threads=threads)``;
    ``self.loop`` is the same call, unwrapped.

    Raises
    ------
    NotImplementedError
        If ``have_real_test_parallel`` is falsy.
    """
    if not have_real_test_parallel:
        raise NotImplementedError

    strings = tm.makeStringIndex(100000)

    def loop():
        factorize(strings)

    def parallel():
        factorize(strings)

    # Apply the decorator by an explicit call instead of ``@`` syntax.
    self.parallel = test_parallel(num_threads=threads)(parallel)
    self.loop = loop
def setup(self):
    """Create an in-memory SQLite engine and load a mixed-dtype frame into it.

    The frame is kept on ``self.df`` and written to table ``"test"``,
    replacing any existing table of that name.
    """
    n_rows = 10000
    self.table_name = "test"
    self.con = create_engine("sqlite:///:memory:")

    columns = {
        "float": np.random.randn(n_rows),
        "float_with_nan": np.random.randn(n_rows),
        "string": ["foo"] * n_rows,
        "bool": [True] * n_rows,
        "int": np.random.randint(0, n_rows, size=n_rows),
        "datetime": date_range("2000-01-01", periods=n_rows, freq="s"),
    }
    frame = DataFrame(columns, index=tm.makeStringIndex(n_rows))

    # Punch a block of missing values into one float column.
    frame.loc[1000:3000, "float_with_nan"] = np.nan
    # Add a string rendering of the datetime column.
    frame["datetime_string"] = frame["datetime"].astype(str)

    self.df = frame
    self.df.to_sql(self.table_name, self.con, if_exists="replace")
def setup(self, orient, frame):
    """Create five 100000-row frames with different dtype mixes.

    Parameters
    ----------
    orient, frame
        Not used here.

    Frames created:

    * ``self.df``: five float columns on an integer index.
    * ``self.df_date_idx``: five float columns on an hourly datetime index.
    * ``self.df_td_int_ts``: timedelta, int and datetime columns.
    * ``self.df_int_floats``: int and float columns.
    * ``self.df_int_float_str``: int, float and string columns.

    The last three share the hourly datetime index.
    """
    N = 10**5
    ncols = 5
    # Lowercase "h" is the hourly alias; uppercase "H" is deprecated.
    index = date_range("20000101", periods=N, freq="h")
    timedeltas = timedelta_range(start=1, periods=N, freq="s")
    datetimes = date_range(start=1, periods=N, freq="s")
    ints = np.random.randint(100000000, size=N)
    floats = np.random.randn(N)
    strings = tm.makeStringIndex(N)

    self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
    self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
    self.df_td_int_ts = DataFrame(
        {
            "td_1": timedeltas,
            "td_2": timedeltas,
            "int_1": ints,
            "int_2": ints,
            "ts_1": datetimes,
            "ts_2": datetimes,
        },
        index=index,
    )
    self.df_int_floats = DataFrame(
        {
            "int_1": ints,
            "int_2": ints,
            "int_3": ints,
            "float_1": floats,
            "float_2": floats,
            "float_3": floats,
        },
        index=index,
    )
    self.df_int_float_str = DataFrame(
        {
            "int_1": ints,
            "int_2": ints,
            "float_1": floats,
            "float_2": floats,
            "str_1": strings,
            "str_2": strings,
        },
        index=index,
    )
def setup(self, index, index_structure):
    """Build a one-million-element Series on an index of the requested shape.

    ``index`` selects the index kind (``"string"``, ``"datetime"`` or
    ``"period"``). ``index_structure`` optionally makes the sorted unique
    index non-unique (``"nonunique_monotonic_inc"``) or non-monotonic
    (``"non_monotonic"``); any other value leaves it unchanged.
    """
    N = 10**6

    if index == "string":
        index = tm.makeStringIndex(N)
    elif index == "datetime":
        index = date_range("1900", periods=N, freq="s")
    elif index == "period":
        index = period_range("1900", periods=N, freq="s")

    index = index.sort_values()
    assert index.is_unique and index.is_monotonic_increasing

    if index_structure == "non_monotonic":
        # Even positions first, then odd positions.
        index = index[::2].append(index[1::2])
    elif index_structure == "nonunique_monotonic_inc":
        # Duplicate the third label in place and drop the last one so the
        # length stays N.
        index = index.insert(item=index[2], loc=2)[:-1]
    assert len(index) == N

    self.s = Series(np.random.rand(N), index=index)
    self.lbl = index[80000]
    # warm up index mapping
    self.s[self.lbl]
def setup_method(self, method):
    """Create index, Series and narrow-dtype Series fixtures on ``self``.

    Every index has length 10 and name ``"a"``. ``self.objs`` is all the
    indexes, then the matching Series, then the narrow-dtype Series.
    ``method`` is not used here.
    """
    self.bool_index = tm.makeBoolIndex(10, name="a")
    self.int_index = tm.makeIntIndex(10, name="a")
    self.float_index = tm.makeFloatIndex(10, name="a")
    self.dt_index = tm.makeDateIndex(10, name="a")
    self.dt_tz_index = tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")
    self.period_index = tm.makePeriodIndex(10, name="a")
    self.string_index = tm.makeStringIndex(10, name="a")
    self.unicode_index = tm.makeUnicodeIndex(10, name="a")

    arr = np.random.randn(10)
    types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"]

    # One Series per index. The tz-aware one is derived from the index itself;
    # the others wrap the shared random values.
    for kind in types:
        kind_index = getattr(self, f"{kind}_index")
        if kind == "dt_tz":
            kind_series = kind_index.to_series()
        else:
            kind_series = Series(arr, index=kind_index, name="a")
        setattr(self, f"{kind}_series", kind_series)

    self.indexes = [getattr(self, f"{t}_index") for t in types]
    self.series = [getattr(self, f"{t}_series") for t in types]

    # To test narrow dtypes, we use narrower *data* elements, not *index* elements
    index = self.int_index
    self.float32_series = Series(arr.astype(np.float32), index=index, name="a")

    arr_int = np.random.choice(10, size=10, replace=False)
    narrow_int_dtypes = {
        "int8": np.int8,
        "int16": np.int16,
        "int32": np.int32,
        "uint8": np.uint8,
        "uint16": np.uint16,
        "uint32": np.uint32,
    }
    for label, np_type in narrow_int_dtypes.items():
        narrow = Series(arr_int.astype(np_type), index=index, name="a")
        setattr(self, f"{label}_series", narrow)

    nrw_types = ["float32", *narrow_int_dtypes]
    self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types]

    self.objs = self.indexes + self.series + self.narrow_series