Example #1
0
def create_file(filename: str, num_records: int, ext: str, seed=42):
    """Write ``num_records`` rows of random student data to ``filename``.

    The output is CSV when ``ext == "csv"`` and a Markdown table for any
    other value of ``ext``.  NumPy's and Python's random generators are both
    seeded with ``seed`` before any data is drawn.
    """
    np.random.seed(seed)
    random.seed(seed)

    as_csv = ext == "csv"
    header = helpers.CSV_HEADER if as_csv else helpers.MARKDOWN_TABLE_HEADER

    with open(filename, "w+", buffering=8192) as out:
        out.write(header)

        # One sequence per column, each holding num_records values.
        # random.sample() draws without replacement.
        roll_nos = random.sample(range(1000, int(1e7)), num_records)
        ages = np.random.randint(16, 21, size=num_records)

        # Each name is a 9-character first name and a 5-character last name
        # separated by a single space.
        first_names = rands_array(9, num_records)
        last_names = rands_array(5, num_records)
        names = first_names + " " + last_names
        branches = np.random.choice(["ECE", "CSE", "EEE", "Mech", "Civil"],
                                    size=num_records)

        # Emit one line per record in the format selected by the extension.
        rows = zip(roll_nos, names, branches, ages)
        if as_csv:
            out.writelines(f"{r}, {n}, {b}, {a}\n" for r, n, b, a in rows)
        else:
            out.writelines(
                f"| {r:7} | {n} | {b:^6} | {a:^4} |\n" for r, n, b, a in rows
            )
Example #2
0
def test_rank_apply():
    """Compare ``groupby(...).rank()`` with ranking each group separately.

    The comparison runs twice: with default arguments and with ``pct=True``.
    """
    group_keys = ['key1', 'key2']

    levels_a = tm.rands_array(10, 100)
    levels_b = tm.rands_array(10, 130)
    codes_a = np.random.randint(0, 100, size=500)
    codes_b = np.random.randint(0, 130, size=500)

    df = DataFrame({'value': np.random.randn(500),
                    'key1': levels_a.take(codes_a),
                    'key2': levels_b.take(codes_b)})

    for rank_kwargs in ({}, {'pct': True}):
        result = df.groupby(group_keys).value.rank(**rank_kwargs)

        # Rank every group on its own, then line the rows up with result.
        per_group = [piece.value.rank(**rank_kwargs)
                     for _, piece in df.groupby(group_keys)]
        expected = concat(per_group, axis=0).reindex(result.index)
        tm.assert_series_equal(result, expected)
Example #3
0
def test_rank_apply():
    """Check grouped ``rank`` against a per-group computation.

    Both the default ranking and the ``pct=True`` variant are verified.
    """
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    columns = {'value': np.random.randn(500),
               'key1': lev1.take(lab1),
               'key2': lev2.take(lab2)}
    df = DataFrame(columns)

    def grouped():
        return df.groupby(['key1', 'key2'])

    def check(**rank_kwargs):
        # Grouped rank must equal the concatenation of per-group ranks once
        # both are put in the same row order.
        result = grouped().value.rank(**rank_kwargs)
        parts = [piece.value.rank(**rank_kwargs) for _, piece in grouped()]
        expected = concat(parts, axis=0)
        expected = expected.reindex(result.index)
        tm.assert_series_equal(result, expected)

    check()
    check(pct=True)
Example #4
0
def test_rank_apply():
    """Verify grouped ``rank`` equals ranking each group individually.

    Checked once with default arguments and once with ``pct=True``.
    """
    group_cols = ["key1", "key2"]

    first_levels = tm.rands_array(10, 100)
    second_levels = tm.rands_array(10, 130)
    first_codes = np.random.randint(0, 100, size=500)
    second_codes = np.random.randint(0, 130, size=500)

    df = DataFrame({
        "value": np.random.randn(500),
        "key1": first_levels.take(first_codes),
        "key2": second_levels.take(second_codes),
    })

    # Default ranking.
    result = df.groupby(group_cols).value.rank()
    pieces = []
    for _, piece in df.groupby(group_cols):
        pieces.append(piece.value.rank())
    expected = concat(pieces, axis=0).reindex(result.index)
    tm.assert_series_equal(result, expected)

    # Percentile ranking.
    result = df.groupby(group_cols).value.rank(pct=True)
    pieces = []
    for _, piece in df.groupby(group_cols):
        pieces.append(piece.value.rank(pct=True))
    expected = concat(pieces, axis=0).reindex(result.index)
    tm.assert_series_equal(result, expected)
Example #5
0
def test_rands_array():
    """Check the shape and element length of ``tm.rands_array`` output."""
    # 1-D request: ten entries, each five characters long.
    flat = tm.rands_array(5, size=10)
    assert flat.shape == (10,)
    assert len(flat[0]) == 5

    # 2-D request: a 10 x 10 grid of seven-character entries.
    grid = tm.rands_array(7, size=(10, 10))
    assert grid.shape == (10, 10)
    assert len(grid[1, 1]) == 7
Example #6
0
def test_rands_array():
    """Check ``tm.rands_array`` for a 1-D and a 2-D ``size`` argument."""
    # (nchars, size, expected shape, index of the element to measure)
    cases = [
        (5, 10, (10,), 0),
        (7, (10, 10), (10, 10), (1, 1)),
    ]
    for nchars, size, shape, probe in cases:
        arr = tm.rands_array(nchars, size=size)
        assert arr.shape == shape
        assert len(arr[probe]) == nchars
Example #7
0
def test_getitem_negative_out_of_bounds():
    """Reading or writing position -11 of a size-10 Series raises IndexError."""
    ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))

    with pytest.raises(IndexError, match="index out of bounds"):
        ser[-11]

    with pytest.raises(
        IndexError,
        match="index -11 is out of bounds for axis 0 with size 10",
    ):
        ser[-11] = "foo"
Example #8
0
def test_getitem_negative_out_of_bounds():
    """Check the IndexError messages for an out-of-range negative position."""
    values = tm.rands_array(5, 10)
    labels = tm.rands_array(10, 10)
    ser = Series(values, index=labels)

    read_msg = "index out of bounds"
    write_msg = "index -11 is out of bounds for axis 0 with size 10"

    # Lookup and assignment report different messages.
    with pytest.raises(IndexError, match=read_msg):
        ser[-11]
    with pytest.raises(IndexError, match=write_msg):
        ser[-11] = 'foo'
    def test_same_len_hash_collisions(self):
        """Two equal-length strings from ``rands_array`` must hash differently.

        Lengths ``2**k + 1`` for k = 8..15 are checked first, followed by
        lengths ``2**k`` for the same k.
        """
        for extra in (1, 0):
            for exponent in range(8, 16):
                length = 2**exponent + extra
                pair = tm.rands_array(length, 2)
                hashed = hash_array(pair, 'utf8')
                assert not hashed[0] == hashed[1]
Example #10
0
    def test_same_len_hash_collisions(self):
        """Hash pairs of same-length strings and require distinct hashes."""
        # Odd lengths (power of two plus one) first, then the exact powers
        # of two, in the same order as two consecutive loops would visit.
        lengths = [2**(step + 8) + 1 for step in range(8)]
        lengths += [2**(step + 8) for step in range(8)]

        for n_chars in lengths:
            strings = tm.rands_array(n_chars, 2)
            hashes = hash_array(strings, 'utf8')
            assert not hashes[0] == hashes[1]
Example #11
0
def prepare_file(fpath):
    """Write a small two-column frame to an Avro file at ``fpath``.

    The frame has an ``id`` column holding ``np.arange(10)`` and a ``name``
    column holding the result of ``pdt.rands_array(10, 10)``.

    Returns
    -------
    tuple
        ``(df_write, fpath)`` -- the frame returned by
        ``cyavro.prepare_pandas_df_for_write`` and the path that was written.
    """
    # A single-argument join leaves the path as is (path-like objects come
    # back as str/bytes); kept so the returned path type does not change.
    fpath = os.path.join(fpath)
    writer = cyavro.AvroWriter(fpath, 'null', avroschema)
    try:
        ids = np.arange(10)
        names = pdt.rands_array(10, 10)
        df_write = pd.DataFrame({"id": ids, "name": names})
        df_write = cyavro.prepare_pandas_df_for_write(df_write, avroschema, copy=False)

        writer.write(df_write)
    finally:
        # Close the writer even if preparing or writing the frame fails, so
        # the underlying file handle is not leaked.
        writer.close()
    return df_write, fpath
Example #12
0
    def test_compress_group_combinations(self):
        """Outer-merge two frames keyed on many distinct string pairs.

        Nothing is asserted; the merge only has to run to completion.
        """
        # ~ 40000000 possible unique groups
        base = tm.rands_array(10, 10000)
        first_key = np.tile(base, 2)
        second_key = first_key[::-1]

        left = DataFrame({'key1': first_key, 'key2': second_key,
                          'value1': np.random.randn(20000)})

        right = DataFrame({'key1': first_key[::2], 'key2': second_key[::2],
                           'value2': np.random.randn(10000)})

        # just to hit the label compression code path
        merge(left, right, how='outer')
Example #13
0
    def test_compress_group_combinations(self):
        """Smoke-test an outer merge over a large space of key combinations."""
        # ~ 40000000 possible unique groups
        forward = np.tile(tm.rands_array(10, 10000), 2)
        backward = forward[::-1]

        left_columns = {
            'key1': forward,
            'key2': backward,
            'value1': np.random.randn(20000),
        }
        right_columns = {
            'key1': forward[::2],
            'key2': backward[::2],
            'value2': np.random.randn(10000),
        }

        # just to hit the label compression code path
        merge(DataFrame(left_columns), DataFrame(right_columns), how='outer')
Example #14
0
def prepare_file(fpath):
    """Create an Avro file at ``fpath`` containing a ten-row test frame.

    The frame is built from ``np.arange(10)`` (column ``id``) and
    ``pdt.rands_array(10, 10)`` (column ``name``), passed through
    ``cyavro.prepare_pandas_df_for_write`` and then written out.

    Returns
    -------
    tuple
        ``(df_write, fpath)`` -- the prepared frame and the path written to.
    """
    # A single-argument join leaves the path as is (path-like objects come
    # back as str/bytes); kept so the returned path type does not change.
    fpath = os.path.join(fpath)
    writer = cyavro.AvroWriter(fpath, 'null', avroschema)
    try:
        ids = np.arange(10)
        names = pdt.rands_array(10, 10)
        df_write = pd.DataFrame({"id": ids, "name": names})
        df_write = cyavro.prepare_pandas_df_for_write(df_write,
                                                      avroschema,
                                                      copy=False)

        writer.write(df_write)
    finally:
        # Always release the writer, including when an earlier step raises.
        writer.close()
    return df_write, fpath
Example #15
0
    def test_series_frame_radd_bug(self):
        """A string on the left of ``+`` is prepended to every element.

        Covers Series and DataFrame operands (GH 353) and checks that adding
        a datetime to ``self.ts`` raises ``TypeError``.
        """
        from operator import add

        prefix = 'foo_'

        def with_prefix(text):
            return prefix + text

        # GH 353
        vals = Series(tm.rands_array(5, 10))
        assert_series_equal(prefix + vals, vals.map(with_prefix))

        frame = DataFrame({'vals': vals})
        expected_frame = DataFrame({'vals': vals.map(with_prefix)})
        tm.assert_frame_equal(prefix + frame, expected_frame)

        # really raise this time
        self.assertRaises(TypeError, add, datetime.now(), self.ts)
Example #16
0
    def test_series_frame_radd_bug(self):
        """Check reflected string addition on a Series and on a DataFrame."""
        import operator

        # GH 353
        strings = Series(tm.rands_array(5, 10))
        series_result = 'foo_' + strings
        series_expected = strings.map(lambda item: 'foo_' + item)
        assert_series_equal(series_result, series_expected)

        frame = DataFrame({'vals': strings})
        frame_result = 'foo_' + frame
        frame_expected = DataFrame(
            {'vals': strings.map(lambda item: 'foo_' + item)})
        tm.assert_frame_equal(frame_result, frame_expected)

        # really raise this time
        moment = datetime.now()
        self.assertRaises(TypeError, operator.add, moment, self.ts)
Example #17
0
    def test_series_frame_radd_bug(self):
        """Reflected string addition works; datetime + time series raises.

        The first half prepends ``'foo_'`` to a Series and a DataFrame
        (GH#353).  The second half checks that adding a ``datetime`` to the
        series from ``tm.makeTimeSeries()`` raises ``TypeError`` in either
        operand order.
        """
        def add_prefix(text):
            return 'foo_' + text

        # GH#353
        vals = pd.Series(tm.rands_array(5, 10))
        tm.assert_series_equal('foo_' + vals, vals.map(add_prefix))

        frame = pd.DataFrame({'vals': vals})
        expected = pd.DataFrame({'vals': vals.map(add_prefix)})
        tm.assert_frame_equal('foo_' + frame, expected)

        ts = tm.makeTimeSeries()
        ts.name = 'ts'

        # really raise this time
        now = pd.Timestamp.now().to_pydatetime()
        for left, right in ((now, ts), (ts, now)):
            with pytest.raises(TypeError):
                left + right
Example #18
0
    def test_series_frame_radd_bug(self):
        """Exercise ``str + Series``, ``str + DataFrame`` and datetime adds."""
        # GH#353
        strings = pd.Series(tm.rands_array(5, 10))
        series_result = 'foo_' + strings
        series_expected = strings.map(lambda item: 'foo_' + item)
        tm.assert_series_equal(series_result, series_expected)

        frame = pd.DataFrame({'vals': strings})
        frame_result = 'foo_' + frame
        frame_expected = pd.DataFrame(
            {'vals': strings.map(lambda item: 'foo_' + item)})
        tm.assert_frame_equal(frame_result, frame_expected)

        series = tm.makeTimeSeries()
        series.name = 'ts'

        # really raise this time
        stamp = pd.Timestamp.now().to_pydatetime()

        # datetime on the left ...
        with pytest.raises(TypeError):
            stamp + series

        # ... and on the right.
        with pytest.raises(TypeError):
            series + stamp
Example #19
0
def test_getitem_negative_out_of_bounds():
    """Position -11 is out of range for both lookup and assignment."""
    ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))

    with pytest.raises(IndexError):
        ser[-11]

    with pytest.raises(IndexError):
        ser[-11] = 'foo'
Example #20
0
    def test_long_strings(self):
        """Run ``self.check_equal`` on an Index of 10000-character strings."""
        long_values = tm.rands_array(nchars=10000, size=100)
        self.check_equal(Index(long_values))
Example #21
0
 def test_same_len_hash_collisions(self, l_exp, l_add):
     """Two strings of length ``2**(l_exp + 8) + l_add`` must hash apart."""
     pair = tm.rands_array(2**(l_exp + 8) + l_add, 2)
     hashes = hash_array(pair, 'utf8')
     assert not hashes[0] == hashes[1]
Example #22
0
def test_rands_array_1d():
    """An integer ``size`` yields a 1-D array of strings of the given length."""
    strings = tm.rands_array(5, size=10)
    assert strings.shape == (10,)
    assert len(strings[0]) == 5
Example #23
0
def test_rands_array_2d():
    """A tuple ``size`` yields an array of that shape with fixed-length items."""
    grid = tm.rands_array(7, size=(10, 10))
    assert grid.shape == (10, 10)
    assert len(grid[1, 1]) == 7
Example #24
0
def test_getitem_negative_out_of_bounds():
    """Getting or setting an out-of-range negative position raises IndexError."""
    values = tm.rands_array(5, 10)
    labels = tm.rands_array(10, 10)
    ser = Series(values, index=labels)

    out_of_range = -11
    pytest.raises(IndexError, ser.__getitem__, out_of_range)
    pytest.raises(IndexError, ser.__setitem__, out_of_range, 'foo')
Example #25
0
def test_rands_array_1d():
    """Check shape and element length for a 1-D ``rands_array`` request."""
    nchars, count = 5, 10
    arr = tm.rands_array(nchars, size=count)
    assert arr.shape == (count,)
    assert len(arr[0]) == nchars
Example #26
0
def test_rands_array_2d():
    """Check shape and element length for a 2-D ``rands_array`` request."""
    nchars, shape = 7, (10, 10)
    arr = tm.rands_array(nchars, size=shape)
    assert arr.shape == shape
    assert len(arr[1, 1]) == nchars
Example #27
0
 def test_very_wide_info_repr(self):
     """Building the repr of a 10 x 20 frame must not raise."""
     values = np.random.randn(10, 20)
     col_names = tm.rands_array(10, 20)
     repr(DataFrame(values, columns=col_names))
Example #28
0
 def test_same_len_hash_collisions(self, l_exp, l_add):
     """Hash two same-length strings and require the hashes to differ."""
     n_chars = 2**(l_exp + 8) + l_add
     hashed = hash_array(tm.rands_array(n_chars, 2), 'utf8')
     first, second = hashed[0], hashed[1]
     assert not first == second
Example #29
0
    def test_long_strings(self):
        """Pass an Index of very long strings to ``self.check_equal``."""
        n_chars, n_items = 10000, 100
        idx = Index(tm.rands_array(nchars=n_chars, size=n_items))
        self.check_equal(idx)
Example #30
0
 def test_very_wide_info_repr(self):
     """Smoke-test ``repr`` on a frame with twenty 10-character column names."""
     n_rows, n_cols = 10, 20
     wide = DataFrame(np.random.randn(n_rows, n_cols),
                      columns=tm.rands_array(10, n_cols))
     repr(wide)