def create_file(filename: str, num_records: int, ext: str, seed=42):
    """Write a table of randomly generated student records to ``filename``.

    Each record holds a roll number, a name, a branch and an age. The
    output is CSV when ``ext == "csv"``; any other value of ``ext``
    produces a Markdown table.

    Args:
        filename: Path of the file to create (opened in ``"w+"`` mode, so
            existing content is truncated).
        num_records: Number of data rows to write.
        ext: Output flavour; only the exact string ``"csv"`` selects CSV.
        seed: Seed applied to both NumPy's and Python's global random
            generators before any data is drawn.
    """
    as_csv = ext == "csv"

    # Seed both generators: roll numbers come from `random`, the rest from NumPy.
    np.random.seed(seed)
    random.seed(seed)

    header = helpers.CSV_HEADER if as_csv else helpers.MARKDOWN_TABLE_HEADER

    with open(filename, "w+", buffering=8192) as out:
        out.write(header)

        # Draw every column up front; each one holds num_records entries.
        # random.sample() draws without replacement, so roll numbers never repeat.
        roll_nos = random.sample(range(1000, int(1e7)), num_records)
        ages = np.random.randint(16, 21, size=num_records)

        # A 9-character first name and a 5-character last name,
        # joined by a single space.
        first_names = rands_array(9, num_records)
        last_names = rands_array(5, num_records)
        names = first_names + " " + last_names

        branches = np.random.choice(
            ["ECE", "CSE", "EEE", "Mech", "Civil"], size=num_records
        )

        # Pick the row layout once, then stream the rows to the file.
        records = zip(roll_nos, names, branches, ages)
        if as_csv:
            rows = (f"{r}, {n}, {b}, {a}\n" for r, n, b, a in records)
        else:
            rows = (
                f"| {r:7} | {n} | {b:^6} | {a:^4} |\n" for r, n, b, a in records
            )
        out.writelines(rows)
def test_rank_apply():
    """Grouped ``rank`` must equal ranking each group on its own.

    The comparison is made twice: with default arguments and with
    ``pct=True``.
    """
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    df = DataFrame({
        'value': np.random.randn(500),
        'key1': lev1.take(lab1),
        'key2': lev2.take(lab2),
    })
    group_keys = ['key1', 'key2']

    for rank_kwargs in ({}, {'pct': True}):
        result = df.groupby(group_keys).value.rank(**rank_kwargs)

        # Rank every group separately, then line the pieces up with `result`.
        pieces = [
            piece.value.rank(**rank_kwargs)
            for _, piece in df.groupby(group_keys)
        ]
        expected = concat(pieces, axis=0).reindex(result.index)

        tm.assert_series_equal(result, expected)
def test_rank_apply():
    """Compare ``groupby(...).value.rank`` with per-group ranking.

    Runs once with default arguments and once with ``pct=True``.
    """
    first_level = tm.rands_array(10, 100)
    second_level = tm.rands_array(10, 130)
    first_codes = np.random.randint(0, 100, size=500)
    second_codes = np.random.randint(0, 130, size=500)

    frame = DataFrame({
        "value": np.random.randn(500),
        "key1": first_level.take(first_codes),
        "key2": second_level.take(second_codes),
    })

    def check_rank(**rank_kwargs):
        # Rank through the grouped Series in one call ...
        result = frame.groupby(["key1", "key2"]).value.rank(**rank_kwargs)

        # ... and by ranking each group on its own, then re-aligning.
        per_group = []
        for _, piece in frame.groupby(["key1", "key2"]):
            per_group.append(piece.value.rank(**rank_kwargs))
        expected = concat(per_group, axis=0)
        expected = expected.reindex(result.index)

        tm.assert_series_equal(result, expected)

    check_rank()
    check_rank(pct=True)
def test_rands_array():
    """``tm.rands_array`` honours the requested string length and shape."""
    # One-dimensional request: 10 strings of 5 characters each.
    flat = tm.rands_array(5, size=10)
    assert flat.shape == (10,)
    assert len(flat[0]) == 5

    # Two-dimensional request: a 10x10 grid of 7-character strings.
    grid = tm.rands_array(7, size=(10, 10))
    assert grid.shape == (10, 10)
    assert len(grid[1, 1]) == 7
def test_rands_array():
    """Check the shape and per-element length of ``tm.rands_array`` output."""
    # (nchars, size, expected shape, index of the element to measure)
    cases = (
        (5, 10, (10,), 0),
        (7, (10, 10), (10, 10), (1, 1)),
    )
    for nchars, size, expected_shape, probe in cases:
        arr = tm.rands_array(nchars, size=size)
        assert arr.shape == expected_shape
        assert len(arr[probe]) == nchars
def test_getitem_negative_out_of_bounds():
    """Reading or assigning ``-11`` on a 10-element Series raises IndexError."""
    values = tm.rands_array(5, 10)
    labels = tm.rands_array(10, 10)
    ser = Series(values, index=labels)

    # Lookup and assignment report the failure with different messages.
    with pytest.raises(IndexError, match="index out of bounds"):
        ser[-11]

    with pytest.raises(
        IndexError,
        match="index -11 is out of bounds for axis 0 with size 10",
    ):
        ser[-11] = "foo"
def test_getitem_negative_out_of_bounds():
    """Index ``-11`` on a 10-element Series must raise on both get and set."""
    getitem_msg = "index out of bounds"
    setitem_msg = "index -11 is out of bounds for axis 0 with size 10"

    ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))

    with pytest.raises(IndexError, match=getitem_msg):
        ser[-11]
    with pytest.raises(IndexError, match=setitem_msg):
        ser[-11] = 'foo'
def test_same_len_hash_collisions(self):
    """Two random strings of equal length must not hash to the same value.

    Lengths tested are ``2**k + 1`` for k in 8..15, followed by ``2**k``
    for the same k.
    """
    for offset in (1, 0):
        for exponent in range(8):
            length = 2 ** (exponent + 8) + offset
            pair = tm.rands_array(length, 2)
            hashes = hash_array(pair, 'utf8')
            assert hashes[0] != hashes[1]
def prepare_file(fpath):
    """Write a small ten-row DataFrame to an Avro file at ``fpath``.

    The frame has an ``id`` column holding 0..9 and a ``name`` column of
    ten strings produced by ``pdt.rands_array(10, 10)``.

    Args:
        fpath: Destination path; a ``str`` or any ``os.PathLike``.

    Returns:
        A ``(df_write, fpath)`` tuple: the frame returned by
        ``cyavro.prepare_pandas_df_for_write`` (the data that was written)
        and the path after ``os.fspath`` normalisation.
    """
    fpath = os.fspath(fpath)

    # NOTE(review): 'null' is presumably the codec (no compression) - confirm
    # against the cyavro.AvroWriter signature.
    writer = cyavro.AvroWriter(fpath, 'null', avroschema)
    try:
        df_write = pd.DataFrame({
            "id": np.arange(10),
            "name": pdt.rands_array(10, 10),
        })
        df_write = cyavro.prepare_pandas_df_for_write(
            df_write, avroschema, copy=False
        )
        writer.write(df_write)
    finally:
        # Always release the writer, even if preparing or writing fails.
        writer.close()

    return df_write, fpath
def test_compress_group_combinations(self):
    """Outer-merge two frames on many distinct two-column keys without error.

    No result is asserted; the merge only has to complete.
    """
    # NOTE(review): the original note estimates ~ 40000000 possible unique
    # groups for these keys; that figure is not verified here.
    base_keys = tm.rands_array(10, 10000)
    key1 = np.tile(base_keys, 2)
    key2 = key1[::-1]

    left = DataFrame({
        'key1': key1,
        'key2': key2,
        'value1': np.random.randn(20000),
    })
    right = DataFrame({
        'key1': key1[::2],
        'key2': key2[::2],
        'value2': np.random.randn(10000),
    })

    # just to hit the label compression code path
    merge(left, right, how='outer')
def test_series_frame_radd_bug(self):
    """A string on the left of ``+`` prefixes every element (GH 353).

    Checked for a Series and a single-column DataFrame; finally, adding a
    ``datetime`` to ``self.ts`` must raise ``TypeError``.
    """
    import operator

    def with_prefix(value):
        return 'foo_' + value

    vals = Series(tm.rands_array(5, 10))
    assert_series_equal('foo_' + vals, vals.map(with_prefix))

    frame = DataFrame({'vals': vals})
    tm.assert_frame_equal(
        'foo_' + frame,
        DataFrame({'vals': vals.map(with_prefix)}),
    )

    # Operands are built outside the `with` so only the addition is guarded.
    now = datetime.now()
    ts = self.ts
    with self.assertRaises(TypeError):
        operator.add(now, ts)
def test_series_frame_radd_bug(self):
    """Left-adding a string prefixes each element; datetime + series raises.

    Regression test for GH#353.
    """
    prefix = 'foo_'

    def prefixed(series):
        return series.map(lambda item: prefix + item)

    vals = pd.Series(tm.rands_array(5, 10))
    tm.assert_series_equal(prefix + vals, prefixed(vals))

    frame = pd.DataFrame({'vals': vals})
    tm.assert_frame_equal(
        prefix + frame,
        pd.DataFrame({'vals': prefixed(vals)}),
    )

    ts = tm.makeTimeSeries()
    ts.name = 'ts'

    # The addition must raise whichever side the datetime is on.
    now = pd.Timestamp.now().to_pydatetime()
    for left, right in ((now, ts), (ts, now)):
        with pytest.raises(TypeError):
            left + right
def test_getitem_negative_out_of_bounds():
    """Getting or setting ``-11`` on a 10-element Series raises IndexError."""
    values = tm.rands_array(5, 10)
    labels = tm.rands_array(10, 10)
    ser = Series(values, index=labels)

    with pytest.raises(IndexError):
        ser[-11]
    with pytest.raises(IndexError):
        ser[-11] = 'foo'
def test_long_strings(self):
    """Run ``self.check_equal`` on an Index of 100 strings of 10000 characters."""
    long_values = tm.rands_array(nchars=10000, size=100)
    self.check_equal(Index(long_values))
def test_same_len_hash_collisions(self, l_exp, l_add):
    """Two random strings of length ``2**(l_exp + 8) + l_add`` hash differently.

    ``l_exp`` and ``l_add`` are supplied by the caller (presumably pytest
    parametrization; the decorator is not visible here).
    """
    str_len = 2 ** (l_exp + 8) + l_add
    pair = tm.rands_array(str_len, 2)
    hashes = hash_array(pair, 'utf8')
    first, second = hashes[0], hashes[1]
    assert first != second
def test_rands_array_1d():
    """An integer ``size`` yields a 1-D array of strings of the given length."""
    n_items, n_chars = 10, 5
    strings = tm.rands_array(n_chars, size=n_items)
    assert strings.shape == (n_items,)
    assert len(strings[0]) == n_chars
def test_rands_array_2d():
    """A tuple ``size`` yields an array of that shape with fixed-length strings."""
    shape, n_chars = (10, 10), 7
    grid = tm.rands_array(n_chars, size=shape)
    assert grid.shape == shape
    assert len(grid[1, 1]) == n_chars
def test_rands_array_1d():
    """``tm.rands_array(5, size=10)`` gives ten strings of five characters."""
    result = tm.rands_array(5, size=10)

    observed_shape = result.shape
    first_length = len(result[0])

    assert observed_shape == (10,)
    assert first_length == 5
def test_rands_array_2d():
    """``tm.rands_array(7, size=(10, 10))`` gives a 10x10 grid of 7-char strings."""
    result = tm.rands_array(7, size=(10, 10))

    observed_shape = result.shape
    sample_length = len(result[1, 1])

    assert observed_shape == (10, 10)
    assert sample_length == 7
def test_very_wide_info_repr(self):
    """``repr`` of a 10x20 frame with 10-character column names must not raise."""
    # Values are drawn before the labels, matching the original call order.
    values = np.random.randn(10, 20)
    labels = tm.rands_array(10, 20)
    wide = DataFrame(values, columns=labels)
    repr(wide)