Esempio n. 1
0
def dfs_arrow(tmp_path_factory):
    """Return the two-column string frame after a round trip through HDF5.

    ``tmp_path_factory`` provides the temporary directory in which
    ``strings.hdf5`` is written. The frame handed back is the one obtained
    by opening that file, not the in-memory frame it was exported from.
    """
    target = str(tmp_path_factory.mktemp("vaex") / 'strings.hdf5')
    columns = {
        's': vaex.string_column(string_list),
        'sr': vaex.string_column(string_list_reverse),
    }
    # we write it out so that the memory is read only
    vaex.from_arrays(**columns).export(target)
    return vaex.open(target)
Esempio n. 2
0
def create_base_ds():
    """Build the base ``DatasetArrays`` fixture.

    The columns are registered in this order: ``x``, ``y``, ``m``, ``n``,
    ``nm``, ``mi``, ``ints``, ``name``, ``name_arrow`` and ``obj``. Every
    column holds 21 rows.

    Returns:
        The populated ``vaex.dataset.DatasetArrays`` instance.
    """
    sentinel = 77777  # written at selected rows, then masked out

    def float_ramp():
        # Big-endian floats -2..18: first column of the transposed (2, 21) grid.
        return np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]

    dataset = vaex.dataset.DatasetArrays("dataset")

    x = float_ramp()
    y = x**2

    # Integers with values near +/- 2**62 at rows 0-2 and again at rows 10-12.
    ints = np.arange(-2, 19, dtype="i8")
    extremes = [2**62 + 1, -2**62 + 1, -2**62 - 1]
    for start in (0, 10):
        ints[start:start + 3] = extremes

    # Masked floats: rows 9 and 19 carry the sentinel and are masked.
    m = float_ramp()
    m[[9, 19]] = sentinel
    m = np.ma.array(m, mask=m == sentinel)

    # Plain floats with NaN at rows 8 and 18.
    n = x.copy()
    n[[8, 18]] = np.nan

    # NaN at rows 8 and 18 plus a masked sentinel at rows 9 and 19.
    nm = x.copy()
    nm[[8, 18]] = np.nan
    nm[[9, 19]] = sentinel
    nm = np.ma.array(nm, mask=nm == sentinel)

    # Integer copy of the data behind ``m`` with an explicit fill value.
    mi = np.ma.array(m.data.astype(np.int64),
                     mask=m.data == sentinel,
                     fill_value=88888)

    # One label per row of ``x``: "<value>bla" followed by int(value) underscores.
    labels = [str(value) + "bla" + ('_' * int(value)) for value in x]
    name = np.array(labels, dtype='U')

    # Mixed-type object column; the last of the ten inserted values is masked.
    obj_data = np.array(
        [
            'train', 'false', True, 1, 30., np.nan, 'something',
            'something a bit longer resembling a sentence?!', -10000,
            'this should be masked',
        ],
        dtype='object',
    )
    obj_mask = np.array([False] * 9 + [True])
    obj = nm.copy().astype('object')
    obj[2:12] = np.ma.MaskedArray(data=obj_data, mask=obj_mask, dtype='object')

    ordered_columns = (
        ("x", x),
        ("y", y),
        ("m", m),
        ('n', n),
        ('nm', nm),
        ("mi", mi),
        ("ints", ints),
        ("name", np.array(name)),
        ("name_arrow", vaex.string_column(name)),
        ("obj", obj),
    )
    for column_name, values in ordered_columns:
        dataset.add_column(column_name, values)

    return dataset


# dsf = create_filtered()
Esempio n. 3
0
def test_arrow_strings():
    """Check length, slicing, ``take`` and filtering of an arrow string column."""
    words = ['a', 'bb', 'ccc', 'dddd']
    expected_rows = 4
    xc = vaex.string_column(words)
    df = vaex.from_arrays(x=xc)

    # Slicing the stored column, and slicing a slice of it.
    assert len(df.columns['x']) == 4
    tail = df.columns['x'][2:4]
    assert tail[:].tolist() == words[2:4]
    assert tail[1:2].tolist() == words[3:4]

    # Slicing the frame itself.
    assert len(df) == expected_rows
    window = df[1:3]
    assert len(window) == 2
    assert df[1:3].x.tolist() == words[1:3]

    # take() with plain indices.
    order = np.array([0, 2, 1, 3])
    assert xc.take(order).tolist() == ['a', 'ccc', 'bb', 'dddd']

    # take() with a masked index yields None at the masked position.
    masked_order = np.ma.array(order, mask=[False, True, False, False])
    assert xc.take(masked_order).tolist() == ['a', None, 'bb', 'dddd']

    # Same plain take() once more, after the masked one.
    order = np.array([0, 2, 1, 3])
    assert xc.take(order).tolist() == ['a', 'ccc', 'bb', 'dddd']

    # Filtering with a plain boolean mask.
    keep = np.array([True, True, False, True])
    assert vaex.array_types.filter(xc, keep).tolist() == ['a', 'bb', 'dddd']

    # Filtering with a masked boolean mask: masked entries are dropped as well.
    keep_masked = np.ma.array(np.array([True, True, False, True]),
                              mask=[False, True, True, False])
    assert vaex.array_types.filter(xc, keep_masked).tolist() == ['a', 'dddd']
Esempio n. 4
0
def test_arrow_strings_null():
    """Null entries must survive conversion to, and slicing of, a string column."""
    values = ['a', 'bb', None, 'dddd', None]
    xc = vaex.string_column(values)
    assert xc.tolist() == values
    # Open-ended slice first, then a bounded one.
    for window in (slice(1, None), slice(2, 4)):
        assert xc[window].tolist() == values[window]
Esempio n. 5
0
def test_unique_arrow(df_factory):
    """Check ``unique`` on a string column inside ``small_buffer(ds, 2)``."""
    letters = ['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a']
    ds = df_factory(x=vaex.string_column(letters))
    with small_buffer(ds, 2):
        assert set(ds.unique(ds.x)) == {'a', 'b'}
        # The inverse index must rebuild the original column from the uniques.
        values, index = ds.unique(ds.x, return_inverse=True)
        rebuilt = np.array(values)[index]
        assert rebuilt.tolist() == ds.x.tolist()
Esempio n. 6
0
def test_string_strip_special_case():
    """Evaluate ``str.strip(' ')`` on one long multi-sentence string.

    The test has no assertion; it only requires the evaluation to complete.
    """
    text = (
        "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? "
        "They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. "
        "And please don't remove the template from the talk page since I'm retired now.89.205.38.27"
    )
    df = vaex.from_arrays(s=vaex.string_column([text]))
    stripped = df.s.str.strip(' ')
    stripped.values  # accessing the attribute is what triggers the evaluation
Esempio n. 7
0
def test_arrow_strings():
    """Check the length and slicing of an arrow string column and its frame."""
    words = ['a', 'bb', 'ccc', 'dddd']
    expected_rows = 4
    df = vaex.from_arrays(x=vaex.string_column(words))

    # The stored column and a slice of it.
    assert len(df.columns['x']) == 4
    tail = df.columns['x'][2:4]
    assert tail[:].tolist() == words[2:4]

    # The frame and a slice of the frame.
    assert len(df) == expected_rows
    assert len(df[1:3]) == 2
    assert df[1:3].x.tolist() == words[1:3]
Esempio n. 8
0
def test_is_na():
    """``isna`` must flag NaN, masked and null entries across column kinds."""
    # Floats with NaN in the last two rows.
    x = np.arange(4, dtype=np.float64)
    x[2:4] = np.nan
    # Same data with rows 1 and 3 masked.
    m = np.ma.array(x, mask=[0, 1, 0, 1])
    # Arrow strings with a null, and a plain list of mixed objects.
    s = vaex.string_column(["aap", None, "noot", "mies"])
    o = ["aap", None, False, np.nan]

    df = vaex.from_arrays(x=x, m=m, s=s, o=o)

    x_flags = df.x.isna().tolist()
    assert (x_flags == [False, False, True, True])
    m_flags = df.m.isna().tolist()
    assert (m_flags == [False, True, True, True])
    s_flags = df.s.isna().tolist()
    assert (s_flags == [False, True, False, False])
    o_flags = df.o.isna().tolist()
    assert (o_flags == [False, True, False, True])
Esempio n. 9
0
def test_arrow_strings():
    """Check the length and slicing of an arrow string column and its frame."""
    words = ['a', 'bb', 'ccc', 'dddd']
    expected_rows = 4
    df = vaex.from_arrays(x=vaex.string_column(words))

    # The stored column and a slice of it.
    assert len(df.columns['x']) == 4
    tail = df.columns['x'][2:4]
    assert tail[:].tolist() == words[2:4]

    # The frame and a slice of the frame.
    assert len(df) == expected_rows
    assert len(df[1:3]) == 2
    assert df[1:3].x.tolist() == words[1:3]
Esempio n. 10
0
def test_string_count_stat():
    """Check that ``count`` on a string column skips missing entries.

    Four kinds of string storage are covered: a plain list, a masked numpy
    array, an object array containing ``np.nan``, and an arrow string column
    containing ``None`` (the last one counted through ``binby``).
    """
    # Plain list: all four strings are counted.
    ds = vaex.from_arrays(names=['hello', 'this', 'is', 'long'])
    assert ds.count(ds.names) == 4

    # Masked array: the masked entry is not counted.
    ds = vaex.from_arrays(names=np.ma.array(['hello', 'this', 'is', 'long'], mask=[0, 0, 1, 0]))
    assert ds.count(ds.names) == 3

    # Object array: the np.nan entry is not counted. The expression is taken
    # from ``df`` itself rather than from the unrelated ``ds`` frame above.
    df = vaex.from_arrays(names=np.array(['hi', 'is', 'l2', np.nan], dtype='O'))
    assert df.count(df.names) == 3

    # Arrow strings with a null, counted into a single bin over x.
    names = vaex.string_column(['hello', 'this', None, 'long'])
    x = np.arange(len(names))
    df = vaex.from_arrays(names=names, x=x)
    assert df.count(df.names, binby='x', limits=[0, 100], shape=1).tolist() == [3]
Esempio n. 11
0
def test_concat_mixed_types():
    """Concatenating a float column with a string column must yield strings."""
    floats = np.full(3, np.nan)
    strings = vaex.string_column(['hi', 'there'])
    df1 = vaex.from_arrays(x=floats)
    df2 = vaex.from_arrays(x=strings)
    df = vaex.concat([df1, df2])

    assert df2.x.dtype == df.x.dtype, "expect 'upcast' to string"

    # Slices inside the first part, across the boundary, and inside the second.
    expectations = (
        (slice(None, 2), ['nan', 'nan']),
        (slice(1, 4), ['nan', 'nan', 'hi']),
        (slice(2, 4), ['nan', 'hi']),
        (slice(3, 4), ['hi']),
        (slice(3, 5), ['hi', 'there']),
    )
    for window, expected in expectations:
        assert df[window].x.tolist() == expected
Esempio n. 12
0
def test_unique_categorical(df_factory, future):
    """Check dtype, ``unique`` and ``nunique`` of an ordinal-encoded column.

    With ``future`` truthy the frame is switched through ``_future()`` and the
    column is expected to report strings; otherwise it reports integer codes.
    """
    df = df_factory(x=vaex.string_column(['a', 'c', 'b', 'a', 'a']))
    df = df.ordinal_encode('x')
    if future:
        df = df._future()
        expected_dtype, expected_values = str, {'a', 'b', 'c'}
    else:
        expected_dtype, expected_values = int, {0, 1, 2}

    assert df.x.dtype == expected_dtype
    assert set(df.x.unique()) == expected_values
    assert df.x.nunique() == 3
Esempio n. 13
0
def test_null_values():
    """``count`` must skip a null string, with and without a selection."""
    s = vaex.string_column(['aap', None, 'mies'])
    df = vaex.from_arrays(s=s, x=[0, 1, 2])

    # Row count, regardless of nulls.
    total_rows = df.count()
    assert total_rows == 3
    # Non-null strings overall.
    non_null = df.count(df.s)
    assert non_null == 2
    # Non-null strings among the rows where x > 0.
    non_null_selected = df.count(df.s, selection=df.x > 0)
    assert non_null_selected == 1
Esempio n. 14
0
def dfs_arrow():
    """Return an in-memory frame with string columns ``s`` and ``sr``.

    ``s`` is built from ``string_list`` and ``sr`` from ``string_list_reverse``.
    """
    forward = vaex.string_column(string_list)
    backward = vaex.string_column(string_list_reverse)
    return vaex.from_arrays(s=forward, sr=backward)
Esempio n. 15
0
def create_base_ds():
    """Build the read-only base DataFrame fixture.

    The columns are passed to ``vaex.from_arrays`` in this order: ``x``,
    ``y``, ``m``, ``n``, ``nm``, ``mi``, ``ints``, ``name``, ``name_arrow``,
    ``obj``, ``bool``, ``datetime``, ``timedelta`` and ``123456``; each has
    21 rows. A virtual column ``z`` (``x+t*y``) and a variable ``t`` (1.0)
    are added afterwards.

    Returns:
        The result of calling ``_readonly()`` on the assembled DataFrame.
    """
    # Big-endian floats -2..18: first column of the transposed (2, 21) grid.
    x = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    y = x**2

    # Integers with values near +/- 2**62 at rows 0-2 and again at rows 10-12.
    ints = np.arange(-2, 19, dtype="i8")
    ints[0] = 2**62 + 1
    ints[1] = -2**62 + 1
    ints[2] = -2**62 - 1
    ints[0 + 10] = 2**62 + 1
    ints[1 + 10] = -2**62 + 1
    ints[2 + 10] = -2**62 - 1

    columns = {"x": x, "y": y}

    # Masked floats. The first masked row is index 11 here, whereas ``nm``
    # below masks index 9, so the two masks differ.
    m = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    ma_value = 77777
    m[-1 + 10 + 2] = ma_value
    m[-1 + 20] = ma_value
    m = np.ma.array(m, mask=m == ma_value)

    # Plain floats with NaN at rows 8 and 18.
    n = x.copy()
    n[-2 + 10] = np.nan
    n[-2 + 20] = np.nan

    # NaN at rows 8 and 18 plus a masked sentinel at rows 9 and 19.
    nm = x.copy()
    nm[-2 + 10] = np.nan
    nm[-2 + 20] = np.nan
    nm[-1 + 10] = ma_value
    nm[-1 + 20] = ma_value
    nm = np.ma.array(nm, mask=nm == ma_value)

    # Integer copy of the data behind ``m`` with an explicit fill value.
    mi = np.ma.array(m.data.astype(np.int64),
                     mask=m.data == ma_value,
                     fill_value=88888)
    columns["m"] = m
    columns['n'] = n
    columns['nm'] = nm
    columns["mi"] = mi
    columns["ints"] = ints

    # One label per row of ``x``: "<value>bla" followed by int(value) underscores.
    name = np.array([str(value) + "bla" + ('_' * int(value)) for value in x],
                    dtype='U')
    columns["name"] = np.array(name)
    columns["name_arrow"] = vaex.string_column(name)

    # Mixed-type object column; the last of the ten inserted values is masked.
    obj_data = np.array([
        'train', 'false', True, 1, 30., np.nan, 'something',
        'something a bit longer resembling a sentence?!', -10000,
        'this should be masked'
    ],
                        dtype='object')
    obj_mask = np.array([False] * 9 + [True])
    obj = nm.copy().astype('object')
    obj[2:12] = np.ma.MaskedArray(data=obj_data, mask=obj_mask, dtype='object')
    columns["obj"] = obj

    # Builtin ``bool`` instead of the ``np.bool`` alias, which NumPy
    # deprecated in 1.20 and removed in 1.24.
    booleans = np.ones(21, dtype=bool)
    booleans[[4, 6, 8, 14, 16, 19]] = False
    columns["bool"] = booleans

    datetime = np.array([
        '2016-02-29T22:02:02.32',
        '2013-01-17T01:02:03.32',
        '2017-11-11T08:15:15.00',
        '1995-04-01T05:55:55.55',
        '2000-01-01T00:00:00.00',
        '2019-03-05T09:12:13.51',
        '1993-10-15T17:23:47.00',
        '2001-09-15T00:00:00.15',
        '2019-02-18T13:12:10.09',
        '1991-07-12T16:17:33.11',
        '2005-05-05T05:05:05.05',
        '2011-08-27T03:06:15.00',
        '1999-07-09T09:01:33.21',
        '2018-04-04T17:30:00.00',
        '2012-12-01T21:00:00.01',
        '1994-05-02T11:22:33.00',
        '2003-07-02T22:33:00.00',
        '2014-06-03T06:30:00.00',
        '1997-09-04T20:31:00.11',
        '2004-02-24T04:00:00.00',
        '2000-06-15T12:30:30.00',
    ],
                        dtype=np.datetime64)
    # Offsets of every timestamp from one fixed reference instant.
    timedelta = datetime - np.datetime64('1996-05-17T16:45:00.00')
    columns["datetime"] = datetime
    columns["timedelta"] = timedelta
    columns["123456"] = x  # a column that will have an alias

    df = vaex.from_arrays(**columns)
    df.add_virtual_column("z", "x+t*y")
    df.set_variable("t", 1.)
    return df._readonly()
Esempio n. 16
0
def create_base_ds():
    """Build the read-only base dataset fixture.

    Columns are registered in this order: ``x``, ``y``, ``m``, ``n``, ``nm``,
    ``mi``, ``ints``, ``name``, ``name_arrow``, ``obj``, ``bool``,
    ``datetime`` and ``timedelta``; each has 21 rows.

    Returns:
        The result of calling ``_readonly()`` on the populated
        ``vaex.dataset.DatasetArrays``.
    """
    dataset = vaex.dataset.DatasetArrays("dataset")
    # Big-endian floats -2..18: first column of the transposed (2, 21) grid.
    x = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    y = x**2

    # Integers with values near +/- 2**62 at rows 0-2 and again at rows 10-12.
    ints = np.arange(-2, 19, dtype="i8")
    ints[0] = 2**62 + 1
    ints[1] = -2**62 + 1
    ints[2] = -2**62 - 1
    ints[0 + 10] = 2**62 + 1
    ints[1 + 10] = -2**62 + 1
    ints[2 + 10] = -2**62 - 1
    dataset.add_column("x", x)
    dataset.add_column("y", y)

    # Masked floats: rows 9 and 19 carry the sentinel and are masked.
    m = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    ma_value = 77777
    m[-1 + 10] = ma_value
    m[-1 + 20] = ma_value
    m = np.ma.array(m, mask=m == ma_value)

    # Plain floats with NaN at rows 8 and 18.
    n = x.copy()
    n[-2 + 10] = np.nan
    n[-2 + 20] = np.nan

    # NaN at rows 8 and 18 plus a masked sentinel at rows 9 and 19.
    nm = x.copy()
    nm[-2 + 10] = np.nan
    nm[-2 + 20] = np.nan
    nm[-1 + 10] = ma_value
    nm[-1 + 20] = ma_value
    nm = np.ma.array(nm, mask=nm == ma_value)

    # Integer copy of the data behind ``m`` with an explicit fill value.
    mi = np.ma.array(m.data.astype(np.int64),
                     mask=m.data == ma_value,
                     fill_value=88888)
    dataset.add_column("m", m)
    dataset.add_column('n', n)
    dataset.add_column('nm', nm)
    dataset.add_column("mi", mi)
    dataset.add_column("ints", ints)

    # One label per row of ``x``: "<value>bla" followed by int(value) underscores.
    name = np.array([str(value) + "bla" + ('_' * int(value)) for value in x],
                    dtype='U')
    dataset.add_column("name", np.array(name))
    dataset.add_column("name_arrow", vaex.string_column(name))

    # Mixed-type object column; the last of the ten inserted values is masked.
    obj_data = np.array([
        'train', 'false', True, 1, 30., np.nan, 'something',
        'something a bit longer resembling a sentence?!', -10000,
        'this should be masked'
    ],
                        dtype='object')
    obj_mask = np.array([False] * 9 + [True])
    obj = nm.copy().astype('object')
    obj[2:12] = np.ma.MaskedArray(data=obj_data, mask=obj_mask, dtype='object')
    dataset.add_column("obj", obj)

    # Builtin ``bool`` instead of the ``np.bool`` alias, which NumPy
    # deprecated in 1.20 and removed in 1.24.
    booleans = np.ones(21, dtype=bool)
    booleans[[4, 6, 8, 14, 16, 19]] = False
    dataset.add_column("bool", booleans)

    datetime = np.array([
        '2016-02-29T22:02:02.32',
        '2013-01-17T01:02:03.32',
        '2017-11-11T08:15:15.00',
        '1995-04-01T05:55:55.55',
        '2000-01-01T00:00:00.00',
        '2019-03-05T09:12:13.51',
        '1993-10-15T17:23:47.00',
        '2001-09-15T00:00:00.15',
        '2019-02-18T13:12:10.09',
        '1991-07-12T16:17:33.11',
        '2005-05-05T05:05:05.05',
        '2011-08-27T03:06:15.00',
        '1999-07-09T09:01:33.21',
        '2018-04-04T17:30:00.00',
        '2012-12-01T21:00:00.01',
        '1994-05-02T11:22:33.00',
        '2003-07-02T22:33:00.00',
        '2014-06-03T06:30:00.00',
        '1997-09-04T20:31:00.11',
        '2004-02-24T04:00:00.00',
        '2000-06-15T12:30:30.00',
    ],
                        dtype=np.datetime64)
    # Offsets of every timestamp from one fixed reference instant.
    timedelta = datetime - np.datetime64('1996-05-17T16:45:00.00')
    dataset.add_column("datetime", datetime)
    dataset.add_column("timedelta", timedelta)

    return dataset._readonly()
Esempio n. 17
0
def create_base_ds():
    """Build the read-only base dataset fixture.

    Columns are registered in this order: ``x``, ``y``, ``m``, ``n``, ``nm``,
    ``mi``, ``ints``, ``name``, ``name_arrow``, ``obj``, ``bool``,
    ``datetime`` and ``timedelta``; each has 21 rows.

    Returns:
        The result of calling ``_readonly()`` on the populated
        ``vaex.dataset.DatasetArrays``.
    """
    dataset = vaex.dataset.DatasetArrays("dataset")
    # Big-endian floats -2..18: first column of the transposed (2, 21) grid.
    x = np.arange(-2, 40, dtype=">f8").reshape((-1,21)).T.copy()[:,0]
    y = x ** 2

    # Integers with values near +/- 2**62 at rows 0-2 and again at rows 10-12.
    ints = np.arange(-2,19, dtype="i8")
    ints[0] = 2**62+1
    ints[1] = -2**62+1
    ints[2] = -2**62-1
    ints[0+10] = 2**62+1
    ints[1+10] = -2**62+1
    ints[2+10] = -2**62-1
    dataset.add_column("x", x)
    dataset.add_column("y", y)

    # Masked floats: rows 9 and 19 carry the sentinel and are masked.
    m = np.arange(-2, 40, dtype=">f8").reshape((-1,21)).T.copy()[:,0]
    ma_value = 77777
    m[-1+10] = ma_value
    m[-1+20] = ma_value
    m = np.ma.array(m, mask=m==ma_value)

    # Plain floats with NaN at rows 8 and 18.
    n = x.copy()
    n[-2+10] = np.nan
    n[-2+20] = np.nan

    # NaN at rows 8 and 18 plus a masked sentinel at rows 9 and 19.
    nm = x.copy()
    nm[-2+10] = np.nan
    nm[-2+20] = np.nan
    nm[-1+10] = ma_value
    nm[-1+20] = ma_value
    nm = np.ma.array(nm, mask=nm==ma_value)

    # Integer copy of the data behind ``m`` with an explicit fill value.
    mi = np.ma.array(m.data.astype(np.int64), mask=m.data==ma_value, fill_value=88888)
    dataset.add_column("m", m)
    dataset.add_column('n', n)
    dataset.add_column('nm', nm)
    dataset.add_column("mi", mi)
    dataset.add_column("ints", ints)

    # One label per row of ``x``: "<value>bla" followed by int(value) underscores.
    name = np.array([str(value) + "bla" + ('_' * int(value)) for value in x], dtype='U')
    dataset.add_column("name", np.array(name))
    dataset.add_column("name_arrow", vaex.string_column(name))

    # Mixed-type object column; the last of the ten inserted values is masked.
    obj_data = np.array(['train', 'false' , True, 1, 30., np.nan, 'something', 'something a bit longer resembling a sentence?!', -10000, 'this should be masked'], dtype='object')
    obj_mask = np.array([False] * 9 + [True])
    obj = nm.copy().astype('object')
    obj[2:12] = np.ma.MaskedArray(data=obj_data, mask=obj_mask, dtype='object')
    dataset.add_column("obj", obj)

    # Builtin ``bool`` instead of the ``np.bool`` alias, which NumPy
    # deprecated in 1.20 and removed in 1.24.
    booleans = np.ones(21, dtype=bool)
    booleans[[4, 6, 8, 14, 16, 19]] = False
    dataset.add_column("bool", booleans)

    datetime = np.array(['2016-02-29T22:02:02.32', '2013-01-17T01:02:03.32', '2017-11-11T08:15:15.00',
                         '1995-04-01T05:55:55.55', '2000-01-01T00:00:00.00', '2019-03-05T09:12:13.51',
                         '1993-10-15T17:23:47.00', '2001-09-15T00:00:00.15', '2019-02-18T13:12:10.09',
                         '1991-07-12T16:17:33.11', '2005-05-05T05:05:05.05', '2011-08-27T03:06:15.00',
                         '1999-07-09T09:01:33.21', '2018-04-04T17:30:00.00', '2012-12-01T21:00:00.01',
                         '1994-05-02T11:22:33.00', '2003-07-02T22:33:00.00', '2014-06-03T06:30:00.00',
                         '1997-09-04T20:31:00.11', '2004-02-24T04:00:00.00', '2000-06-15T12:30:30.00',
                         ],dtype=np.datetime64)
    # Offsets of every timestamp from one fixed reference instant.
    timedelta = datetime - np.datetime64('1996-05-17T16:45:00.00')
    dataset.add_column("datetime", datetime)
    dataset.add_column("timedelta", timedelta)

    return dataset._readonly()
Esempio n. 18
0
def test_string_strip_special_case2():
    """``str.upper`` must agree with ``str_pandas.upper`` on non-ASCII text."""
    text = 'The eunuch in question left me no choice but to reinsert it. Take action as you see fit.·snunɐw·'
    df = vaex.from_arrays(s=vaex.string_column([text]))
    native_result = df.s.str.upper().tolist()
    pandas_result = df.s.str_pandas.upper().tolist()
    assert native_result == pandas_result
Esempio n. 19
0
def test_string_strip_special_case2():
    """``str.capitalize`` must agree with ``str_pandas.capitalize``."""
    samples = ['ɐa', 'aap']
    df = vaex.from_arrays(s=vaex.string_column(samples))
    native_result = df.s.str.capitalize().tolist()
    pandas_result = df.s.str_pandas.capitalize().tolist()
    assert native_result == pandas_result
Esempio n. 20
0
def test_concat_arrow_strings():
    """Concatenating two string frames keeps the data type and the row order."""
    first = ['aap', 'noot', 'mies']
    second = ['a', 'b', 'c']
    df1 = vaex.from_arrays(x=vaex.string_column(first))
    df2 = vaex.from_arrays(x=vaex.string_column(second))

    combined = vaex.concat([df1, df2])

    assert combined.data_type('x') == df1.data_type('x')
    assert combined.x.tolist() == first + second