Example 1
def test_concat_functions():
    def foo(a, b):
        return a + b

    df1 = vaex.from_scalars(x=1, y=2)
    df2 = vaex.from_scalars(x=2, y=3)
    df1.add_function('foo', foo)
    df2.add_function('foo', foo)
    # w has same expression and function
    df1['w'] = df1.func.foo(df1.x, df1.y)
    df2['w'] = df2.func.foo(df2.x, df2.y)
    assert df1.w.tolist() == [3]
    df = vaex.concat([df1, df2])
    assert df.w.tolist() == [1 + 2, 2 + 3]

    # now bar is a new function
    def bar1(a, b):
        return a + b

    def bar2(a, b):
        return a + b

    df1 = vaex.from_scalars(x=1, y=2)
    df2 = vaex.from_scalars(x=2, y=3)
    df1.add_function('bar', bar1)
    df2.add_function('bar', bar2)
    with pytest.raises(ValueError):
        df = vaex.concat([df1, df2])
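
These snippets read like pytest tests and omit their module-level imports (typically import numpy as np, import pytest, import vaex). A minimal self-contained sketch of the basic vaex.concat call they all build on:

import vaex

# two single-row DataFrames concatenated into one two-row DataFrame
df1 = vaex.from_scalars(x=1, y=2)
df2 = vaex.from_scalars(x=2, y=3)
df = vaex.concat([df1, df2])
assert df.x.tolist() == [1, 2]
assert df.y.tolist() == [2, 3]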
Example 2
def test_concat():
    x1, y1, z1 = np.arange(3), np.arange(3, 0, -1), np.arange(10, 13)
    x2, y2, z2 = np.arange(3, 6), np.arange(0, -3, -1), np.arange(13, 16)
    x3, y3, z3 = np.arange(6, 9), np.arange(-3, -6, -1), np.arange(16, 19)
    w1, w2, w3 = np.array(['cat']*3), np.array(['dog']*3), np.array(['fish']*3)
    x = np.concatenate((x1, x2, x3))
    y = np.concatenate((y1, y2, y3))
    z = np.concatenate((z1, z2, z3))
    w = np.concatenate((w1, w2, w3))

    ds = vaex.from_arrays(x=x, y=y, z=z, w=w)
    ds1 = vaex.from_arrays(x=x1, y=y1, z=z1, w=w1)
    ds2 = vaex.from_arrays(x=x2, y=y2, z=z2, w=w2)
    ds3 = vaex.from_arrays(x=x3, y=y3, z=z3, w=w3)

    dd = vaex.concat([ds1, ds2])
    ww = ds1.concat(ds2)

    # Test if the concatenation of two arrays with the vaex method is the same as with the dataset method
    assert (np.array(dd.evaluate('x,y,z,w'.split(','))) == np.array(ww.evaluate('x,y,z,w'.split(',')))).all()

    # Test if the concatenation of multiple datasets works
    dd = vaex.concat([ds1, ds2, ds3])
    assert (np.array(dd.evaluate('x')) == np.array(ds.evaluate('x'))).all()
    assert (np.array(dd.evaluate('y')) == np.array(ds.evaluate('y'))).all()
    assert (np.array(dd.evaluate('z')) == np.array(ds.evaluate('z'))).all()
    assert (np.array(dd.evaluate('w')) == np.array(ds.evaluate('w'))).all()

    # Test if the concatenation of concatenated datasets works
    dd1 = vaex.concat([ds1, ds2])
    dd2 = vaex.concat([dd1, ds3])
    assert (np.array(dd2.evaluate('x')) == np.array(ds.evaluate('x'))).all()
    assert (np.array(dd2.evaluate('y')) == np.array(ds.evaluate('y'))).all()
    assert (np.array(dd2.evaluate('z')) == np.array(ds.evaluate('z'))).all()
    assert (np.array(dd2.evaluate('w')) == np.array(ds.evaluate('w'))).all()
Example 3
def _from_csv_convert_and_read(filename_or_buffer,
                               path_output,
                               chunk_size,
                               fs_options,
                               fs=None,
                               copy_index=False,
                               progress=None,
                               **kwargs):
    # figure out the CSV file path
    csv_path = vaex.file.stringyfy(filename_or_buffer)
    path_output_bare, ext, _ = vaex.file.split_ext(path_output)

    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    # we don't have indeterminate progress bars, so coerce progress to a boolean
    progress = bool(progress) if progress is not None else False
    if progress:
        print("Converting csv to chunk files")
    with vaex.file.open(filename_or_buffer,
                        fs_options=fs_options,
                        fs=fs,
                        for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer,
                                 chunksize=chunk_size,
                                 **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            chunk_name = f'{path_output_bare}_chunk_{i}{ext}'
            df.export(chunk_name)
            converted_paths.append(chunk_name)
            log.info('saved chunk #%d to %s' % (i, chunk_name))
            if progress:
                print("Saved chunk #%d to %s" % (i, chunk_name))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], path_output)
    else:
        if progress:
            print('Converting %d chunks into single file %s' %
                  (len(converted_paths), path_output))
        log.info('converting %d chunks into single file %s' %
                 (len(converted_paths), path_output))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export(path_output, progress=progress)

        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error(
                    'Could not close or delete intermediate file %s used to convert %s to single file %s: %s',
                    df_path, csv_path, path_output, e)
Example 4
def df_concat_cache(ds_trimmed_cache):
    df = ds_trimmed_cache
    df1 = df[:2]  # length 2
    df2 = df[2:3]  # length 1
    df3 = df[3:7]  # length 4
    df4 = df[7:10]  # length 3
    return vaex.concat([df1, df2, df3, df4])
Example 5
def _from_csv_convert_and_read(filename_or_buffer,
                               maybe_convert_path,
                               chunk_size,
                               fs_options,
                               fs=None,
                               copy_index=False,
                               **kwargs):
    # figure out the CSV file path
    if isinstance(maybe_convert_path, str):
        csv_path = re.sub(r'\.hdf5$',
                          '',
                          str(maybe_convert_path),
                          flags=re.IGNORECASE)
    elif isinstance(filename_or_buffer, str):
        csv_path = filename_or_buffer
    else:
        raise ValueError(
            'Cannot derive filename to use for converted HDF5 file, '
            'please specify it using convert="my.csv.hdf5"')

    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    with vaex.file.open(filename_or_buffer,
                        fs_options=fs_options,
                        fs=fs,
                        for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer,
                                 chunksize=chunk_size,
                                 **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            filename_hdf5 = _convert_name(csv_path, suffix='_chunk%d' % i)
            df.export_hdf5(filename_hdf5)
            converted_paths.append(filename_hdf5)
            log.info('saved chunk #%d to %s' % (i, filename_hdf5))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], combined_hdf5)
    else:
        log.info('converting %d chunks into single HDF5 file %s' %
                 (len(converted_paths), combined_hdf5))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export_hdf5(combined_hdf5)

        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error(
                    'Could not close or delete intermediate hdf5 file %s used to convert %s to hdf5: %s'
                    % (df_path, csv_path, e))
Example 6
def test_concat_missing_values():
    df1 = vaex.from_arrays(x=[1, 2, 3], y=[np.nan, 'b', 'c'])
    df2 = vaex.from_arrays(x=[4, 5, np.nan], y=['d', 'e', 'f'])
    df = vaex.concat([df1, df2])

    repr(df.head(4))
    repr(df.tail(4))
    assert len(df) == 6
Example 7
def test_concat_keep_virtual():
    df1 = vaex.from_scalars(x=1, y=2)
    df2 = vaex.from_scalars(x=2, y=3)
    # w has same expression
    df1['w'] = df1.x + df1.y
    df2['w'] = df2.x + df2.y
    df = vaex.concat([df1, df2])
    assert 'w' in df.virtual_columns
    assert 'w' not in df.get_column_names(virtual=False)
    assert 'w' not in df.dataset
Example 8
def _iris(name, iris_previous, N):
    filename = os.path.join(vaex.utils.get_private_dir('data'), name + '.hdf5')
    if os.path.exists(filename):
        return vaex.open(filename)
    else:
        iris = iris_previous()
        repeat = int(np.ceil(N / len(iris)))
        ds = vaex.concat([iris] * repeat)
        ds.export_hdf5(filename)
        return vaex.open(filename)
Example 9
def test_concat_unequals_virtual_columns():
    ds1 = vaex.from_scalars(x=1, y=2)
    ds2 = vaex.from_scalars(x=2, y=3)
    # w has same expression
    ds1['w'] = ds1.x + ds1.y
    ds2['w'] = ds2.x + ds2.y
    # z does not
    ds1['z'] = ds1.x + ds1.y
    ds2['z'] = ds2.x * ds2.y
    ds = vaex.concat([ds1, ds2])
    assert ds.w.tolist() == [1 + 2, 2 + 3]
    assert ds.z.tolist() == [1 + 2, 2 * 3]
Example 10
def test_concat_unequals_virtual_columns():
    df1 = vaex.from_scalars(x=1, y=2)
    df2 = vaex.from_scalars(x=2, y=3)
    # w has same expression
    df1['w'] = df1.x + df1.y
    df2['w'] = df2.x + df2.y
    # z does not
    df1['z'] = df1.x + df1.y
    df2['z'] = df2.x * df2.y
    df = vaex.concat([df1, df2])
    assert df.w.tolist() == [1 + 2, 2 + 3]
    assert df.z.tolist() == [1 + 2, 2 * 3]
Example 11
def test_concat_timestamp():
    df1 = pa.Table.from_arrays(
        [pa.array(['2020-01-31', '2020-01-31']).cast('timestamp[us]')],
        names=['ts'])
    df2 = pa.Table.from_arrays(
        [pa.array(['2020-12-31', '2020-12-31']).cast('timestamp[ns]')],
        names=['ts'])
    df1_vx = vaex.from_arrow_table(df1)
    df2_vx = vaex.from_arrow_table(df2)
    df = vaex.concat([df1_vx, df2_vx])
    assert df.ts.tolist() == df1['ts'].to_pylist() + df2['ts'].to_pylist()
    assert df.ts.dtype.internal == pa.timestamp('ns')
Example 12
def test_concat_mixed_types():
    x1 = np.zeros(3) + np.nan
    x2 = vaex.string_column(['hi', 'there'])
    df1 = vaex.from_arrays(x=x1)
    df2 = vaex.from_arrays(x=x2)
    df = vaex.concat([df1, df2])
    assert df2.x.dtype == df.x.dtype, "expect 'upcast' to string"
    assert df[:2].x.tolist() == ['nan', 'nan']
    assert df[1:4].x.tolist() == ['nan', 'nan', 'hi']
    assert df[2:4].x.tolist() == ['nan', 'hi']
    assert df[3:4].x.tolist() == ['hi']
    assert df[3:5].x.tolist() == ['hi', 'there']
Example 13
def test_concat(df_file, tmpdir):
    path = tmpdir / 'test2.hdf5'
    df_file[['x']].export(path)
    df_concat = vaex.open(path)
    df = vaex.concat([df_file, df_concat])
    assert len(pickle.dumps(df)) < 2000
    df2 = pickle.loads(pickle.dumps(df))
    assert len(df) == len(df_file) * 2
    assert len(df2) == len(df_file) * 2
    # assert df.compare(df2) == ([], [], [], [])
    assert df2.x.count() == len(df_file) * 2, 'x is repeated'
    assert df2.x.sum() == df_file.x.sum() * 2, 'x is repeated'
    assert df2.y.sum() == df_file.y.sum(), 'y is not repeated'
Example 14
def test_from_csv():
    # can read with default options
    df = vaex.from_csv(csv_path, copy_index=True)
    _assert_csv_content(df, with_index=True)

    # can read an empty CSV
    df = vaex.from_csv(os.path.join(path, 'data', 'empty.csv'))
    assert len(df) == 0

    # can read as chunks iterator
    df_iterator = vaex.from_csv(csv_path, chunk_size=1)
    df1 = next(df_iterator)
    assert len(df1) == 1
    df2, df3 = next(df_iterator), next(df_iterator)
    with pytest.raises(StopIteration):
        next(df_iterator)
    _assert_csv_content(vaex.concat([df1, df2, df3]))
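
The chunked reader used above scales to larger files: vaex.from_csv(..., chunk_size=...) yields DataFrames that can be stitched back together with vaex.concat. A rough sketch, assuming a hypothetical big.csv on disk:

import vaex

# chunk_size turns from_csv into an iterator of DataFrames (file name is made up)
chunks = vaex.from_csv('big.csv', chunk_size=100_000)
df = vaex.concat(list(chunks))
print(len(df))

Note that this keeps every chunk in memory; for data that does not fit, the converter helpers in Examples 3 and 5 export each chunk to disk first and concatenate the opened files instead.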
Example 15
def _from_dataframe_to_vaex(df: DataFrameObject) -> vaex.dataframe.DataFrame:
    """
    Note: we need to implement/test support for bit/byte masks, chunk handling, etc.
    """
    # Iterate through the chunks
    dataframe = []
    _buffers = []
    for chunk in df.get_chunks():

        # We need a dict of columns here, with each column being an expression.
        columns = dict()
        _k = _DtypeKind
        _buffers_chunks = []  # hold on to buffers, keeps memory alive
        for name in chunk.column_names():
            if not isinstance(name, str):
                raise ValueError(f"Column {name} is not a string")
            if name in columns:
                raise ValueError(f"Column {name} is not unique")

            col = chunk.get_column_by_name(name)
            if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
                # Simple numerical or bool dtype, convert to a numpy array
                columns[name], _buf = convert_column_to_ndarray(col)
            elif col.dtype[0] == _k.CATEGORICAL:
                columns[name], _buf = convert_categorical_column(col)
            elif col.dtype[0] == _k.STRING:
                columns[name], _buf = convert_string_column(col)
            else:
                raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")

            _buffers_chunks.append(_buf)

        dataframe.append(vaex.from_dict(columns))
        # chunk buffers are added to list of all buffers
        _buffers.append(_buffers_chunks)

    if df.num_chunks() == 1:
        _buffers = _buffers[0]

    df_new = vaex.concat(dataframe)
    df_new._buffers = _buffers
    return df_new
Example 16
def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert df.get_column_names() == ['X!1', 'class']
    assert df.get_column_names(alias=False) != ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert 'X!1' in df._column_aliases
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()

    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())
Example 17
def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert str(df['X!1']) != 'X!1', "invalid identifier cannot be an expression"
    assert str(df['class']) != 'class', "keyword cannot be an expression"
    assert df.get_column_names() == ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()

    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())
Example 18
    def predict(self, instances, **kwargs):

        if isinstance(instances[0], list):
            data = np.asarray(instances).T
            df = vaex.from_arrays(Arrival_Time=data[0],
                                  Creation_Time=data[1],
                                  x=data[2],
                                  y=data[3],
                                  z=data[4])

        elif isinstance(instances[0], dict):
            dfs = []
            for instance in instances:
                df = vaex.from_dict(instance)
                dfs.append(df)
            df = vaex.concat(dfs)

        else:
            return ['invalid input format']

        df.state_set(self.state, set_filter=False)
        return df.pred_name.tolist()
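
For the dict branch above, each element of instances is expected to be a column-oriented mapping that vaex.from_dict accepts; a hypothetical two-instance payload (values made up) and the resulting concat would look roughly like:

import vaex

# hypothetical per-instance payloads: column name -> list of values
instances = [
    {'Arrival_Time': [1], 'Creation_Time': [2], 'x': [-0.5], 'y': [0.1], 'z': [9.8]},
    {'Arrival_Time': [3], 'Creation_Time': [4], 'x': [-0.4], 'y': [0.2], 'z': [9.7]},
]
df = vaex.concat([vaex.from_dict(instance) for instance in instances])
assert len(df) == 2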
Example 19
def predict(data: Data):
    instances = data.instances

    if isinstance(instances[0], list):
        data = np.asarray(instances).T
        df = vaex.from_arrays(Arrival_Time=data[0],
                              Creation_Time=data[1],
                              x=data[2],
                              y=data[3],
                              z=data[4])

    elif isinstance(instances[0], dict):
        dfs = []
        for instance in instances:
            df = vaex.from_dict(instance)
            dfs.append(df)
        df = vaex.concat(dfs)

    else:
        return {'predictions': 'invalid input format'}

    df.state_set(global_items['state'], set_filter=False)
    return {'predictions': df.pred_name.tolist()}
Example 20
def test_concat_strict(df_factory):
    df1 = df_factory(x=[1, 2])
    df2 = df_factory(x=[3, None, 4])
    df = vaex.concat([df1, df2], resolver='strict')
    assert df.x.tolist() == [1, 2, 3, None, 4]
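
The resolver='strict' argument used here is the restrictive counterpart of the default resolver (named 'flexible' in recent vaex versions, as far as I can tell); the flexible behaviour, which the mixed-type examples elsewhere in this listing rely on, can be sketched as:

import numpy as np
import vaex

df1 = vaex.from_arrays(x=np.array([1, 2], dtype='int32'))
df2 = vaex.from_arrays(x=np.array([3.5, 4.5], dtype='float64'))
df = vaex.concat([df1, df2])  # default resolver='flexible'
print(df.x.dtype)             # expected to resolve to a common (floating-point) type
print(df.x.tolist())          # values from both frames, in order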
Example 21
def test_concat_arrow_strings():
    df1 = vaex.from_arrays(x=vaex.string_column(['aap', 'noot', 'mies']))
    df2 = vaex.from_arrays(x=vaex.string_column(['a', 'b', 'c']))
    df = vaex.concat([df1, df2])
    assert df.data_type('x') == df1.data_type('x')
    assert df.x.tolist() == ['aap', 'noot', 'mies', 'a', 'b', 'c']
Example 22
def concat(*types):
    dfs = [
        vaex.from_arrays(x=np.arange(3, dtype=dtype)) for dtype in types
    ]
    dataset_concat = vaex.concat(dfs)
    return dataset_concat
Example 23
def test_concat_chunk_iterator(l1, l2):
    i1 = 0
    i2 = i1 + l1
    i3 = i2 + l2
    x = np.arange(10)
    y = x**2
    g = x // 3
    ds = vaex.dataset.DatasetArrays(x=x, y=y, g=g)
    df_original = df = vaex.from_dataset(ds)
    df1 = df[i1:i2]
    df2 = df[i2:i3]
    df3 = df[i3:]
    df = vaex.concat([df1, df2, df3])
    ds_full = ds = df.dataset

    # very similar to the arrow/dataset_test.py parquet test
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    # no columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    ds = ds[1:10]
    assert 'x' in ds
    assert ds.row_count == 9
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        if i == 4:
            assert i1 == 8
            assert i2 == 9
        else:
            assert i1 == i * 2
            assert i2 == (i + 1) * 2
        # chunks = chunks
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    ds = ds[1:9]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    # no columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    # again, but here we skip a total of one chunk_size at the end
    ds = ds_full[:8]
    # import pdb; pdb.set_trace()
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    for i in range(9):
        for j in range(i + 1, 10):
            ds = ds_full.slice(i, j)
            values = []
            for i1, i2, chunks in ds.chunk_iterator(['x']):
                values.extend(chunks['x'].tolist())
            assert x[i:j].tolist() == values

    assert df.x.tolist() == x.tolist()
    assert df.g.tolist() == g.tolist()

    ds_dropped = ds.dropped('x')
    assert 'x' not in ds_dropped
Example 24
def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: Uses `dataframe.export` when convert is a path. If True, ``convert=path+'.hdf5'``
                    The conversion is skipped if the input file or conversion argument did not change.
    :param progress: (_Only applies when convert is not False_) {progress}
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: Extra arguments passed to an optional file system if needed:
        * Amazon AWS S3
            * `anonymous` - access file without authentication (public files)
            * `access_key` - AWS access key, if not provided will use the standard env vars, or the `~/.aws/credentials` file
            * `secret_key` - AWS secret key, similar to `access_key`
            * `profile` - If multiple profiles are present in `~/.aws/credentials`, pick this one instead of 'default', see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            * `region` - AWS Region, e.g. 'us-east-1', will be determined automatically if not provided.
            * `endpoint_override` - URL/ip to connect to, instead of AWS, e.g. 'localhost:9000' for minio
        * Google Cloud Storage
            * :py:class:`gcsfs.core.GCSFileSystem`
        In addition you can pass the boolean "cache" option.
    :param group: (optional) Specify the group to be read from an HDF5 file. By default this is set to "/table".
    :param fs: Apache Arrow FileSystem object, or FSSpec FileSystem object; if specified, fs_options should be empty.
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    Cloud storage support:

    Vaex supports streaming of HDF5 files from Amazon AWS S3 and Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/(s3|gs) such that successive access
    is as fast as native disk access.

    The following common fs_options are used for S3 access:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)

    All fs_options can also be encoded in the file path as a query string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', fs_options={{'anonymous': True}})
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5', fs_options={{'access_key': my_key, 'secret_key': my_secret_key}})
    >>> df = vaex.open(f's3://mybucket/path/to/file.hdf5?access_key={{my_key}}&secret_key={{my_secret_key}}')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myproject')

    Google Cloud Storage support:

    The following fs_options are used for GCP access:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5', fs_options={{'token': None}})
    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        if not isinstance(path, (list, tuple)):
            # remote servers and clusters only support a single path, not a list
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            if path.startswith("http://") or path.startswith("ws://") or \
                path.startswith("vaex+wss://") or path.startswith("wss://") or \
               path.startswith("vaex+http://") or path.startswith("vaex+ws://"):
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        filenames = []
        for path in paths:
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            naked_path, options = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                filenames.extend(list(sorted(vaex.file.glob(path, fs_options=fs_options, fs=fs))))
            else:
                filenames.append(path)
        df = None
        if len(filenames) == 0:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        filename_hdf5_noshuffle = vaex.convert._convert_name(filenames, shuffle=False)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
            if convert:
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(
                    path_input=path, fs_options_input=fs_options, fs_input=fs,
                    path_output=path_output, fs_options_output=fs_options, fs_output=fs,
                    progress=progress,
                    *args, **kwargs
                )
                ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs, **kwargs)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        elif len(filenames) > 1:
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                dfs = []
                for filename in filenames:
                    dfs.append(vaex.open(filename, fs_options=fs_options, fs=fs, convert=bool(convert), shuffle=shuffle, **kwargs))
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5, progress=progress)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logger.exception("error opening %r" % path)
        raise
Example 25
def open(path, convert=False, shuffle=False, fs_options={}, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myprofile')

    GCS support:

    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        path = vaex.file.stringyfy(path)
        if path in aliases:
            path = aliases[path]
        path = vaex.file.stringyfy(path)
        if path.startswith("http://") or path.startswith("ws://") or \
           path.startswith("vaex+http://") or path.startswith("vaex+ws://"):  # TODO: think about https and wss
            server, name = path.rsplit("/", 1)
            url = urlparse(path)
            if '?' in name:
                name = name[:name.index('?')]
            extra_args = {
                key: values[0]
                for key, values in parse_qs(url.query).items()
            }
            if 'token' in extra_args:
                kwargs['token'] = extra_args['token']
            if 'token_trusted' in extra_args:
                kwargs['token_trusted'] = extra_args['token_trusted']
            client = vaex.connect(server, **kwargs)
            return client[name]
        if path.startswith("cluster"):
            import vaex.enterprise.distributed
            return vaex.enterprise.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            if isinstance(path, str):
                paths = [path]
            else:
                paths = path
            filenames = []
            for path in paths:
                naked_path, options = vaex.file.split_options(path)
                if glob.has_magic(naked_path):
                    filenames.extend(
                        list(sorted(vaex.file.glob(path, **kwargs))))
                else:
                    filenames.append(path)
            df = None
            if len(filenames) == 0:
                raise IOError(f'File pattern did not match anything {path}')
            filename_hdf5 = vaex.convert._convert_name(filenames,
                                                       shuffle=shuffle)
            filename_hdf5_noshuffle = vaex.convert._convert_name(filenames,
                                                                 shuffle=False)
            if len(filenames) == 1:
                path = filenames[0]
                # # naked_path, _ = vaex.file.split_options(path, fs_options)
                _, ext, _ = vaex.file.split_ext(path)
                if ext == '.csv':  # special case for csv
                    return vaex.from_csv(path,
                                         fs_options=fs_options,
                                         convert=convert,
                                         **kwargs)
                if convert:
                    path_output = convert if isinstance(convert,
                                                        str) else filename_hdf5
                    vaex.convert.convert(path_input=path,
                                         fs_options_input=fs_options,
                                         path_output=path_output,
                                         fs_options_output=fs_options,
                                         *args,
                                         **kwargs)
                    ds = vaex.dataset.open(path_output, fs_options=fs_options)
                else:
                    ds = vaex.dataset.open(path, fs_options=fs_options)
                df = vaex.from_dataset(ds)
                if df is None:
                    if os.path.exists(path):
                        raise IOError(
                            'Could not open file: {}, did you install vaex-hdf5? Is the format supported?'
                            .format(path))
            elif len(filenames) > 1:
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = vaex.convert._convert_name(filenames,
                                                               shuffle=shuffle)
                if os.path.exists(
                        filename_hdf5) and convert:  # also check mtime
                    df = vaex.open(filename_hdf5)
                else:
                    dfs = []
                    for filename in filenames:
                        dfs.append(
                            vaex.open(filename,
                                      convert=bool(convert),
                                      shuffle=shuffle,
                                      **kwargs))
                    df = vaex.concat(dfs)
                    if convert:
                        if shuffle:
                            df = df.shuffle()
                        df.export_hdf5(filename_hdf5)
                        df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
Example 26
def compute_flow_data(days, hours, zone):
    logger.info("Compute: flow data: days=%r hours=%r zone=%r", days, hours,
                zone)
    df, selection = create_selection(days, hours)
    df.select(df.pickup_zone == zone, mode='and')
    selection = True
    df_flow_zone = df.groupby(
        [df.pickup_zone, df.dropoff_zone],
        agg={'count_trips': vaex.agg.count(selection=selection)})
    # sort descending so we can take the top N
    df_flow_zone = df_flow_zone.sort('count_trips', ascending=False)

    df_flow_zone['pickup_borough'] = df_flow_zone.pickup_zone.map(
        zone_index_to_borough_index)
    df_flow_zone['dropoff_borough'] = df_flow_zone.dropoff_zone.map(
        zone_index_to_borough_index)

    pickup_zone = zone
    pickup_borough = zone_index_to_borough_index[pickup_zone]

    # Now include the total count of all trips for zones that are not in the top N:
    # only trips leaving from this zone and going to a different borough
    df_outflow_zone = df_flow_zone[(df_flow_zone.pickup_zone == pickup_zone)]
    df_outflow_zone = df_outflow_zone[
        df_outflow_zone.dropoff_borough != pickup_borough]

    df_outflows_top = []
    df_outflows_rest = []

    for dropoff_borough in range(6):
        if dropoff_borough == pickup_borough:
            continue
        # outflow from this zone, to a particular borough
        df_outflow_zone_borough = df_outflow_zone[
            df_outflow_zone.dropoff_borough == dropoff_borough]
        if len(df_outflow_zone_borough):
            n_max = min(len(df_outflow_zone_borough), n_largest)
            # top N zones of outflow from this zone, to a particular borough
            df_outflows_top.append(df_outflow_zone_borough[:n_max])

            if len(df_outflow_zone_borough) > n_largest:
                count_other = df_outflow_zone_borough[n_largest:][
                    'count_trips'].sum()

                # rest of the outflow from this zone, to a particular borough
                df_outflows_rest.append(
                    vaex.from_scalars(pickup_borough=pickup_borough,
                                      dropoff_borough=dropoff_borough,
                                      dropoff_zone=len(zone_index_to_name) +
                                      dropoff_borough,
                                      count_trips=count_other))

    df_outflow_top = vaex.concat(df_outflows_top)
    df_outflow_borough = df_outflow_zone.groupby(
        ['pickup_borough', 'dropoff_borough'],
        agg={'count_trips': vaex.agg.sum('count_trips')})
    if df_outflows_rest:
        df_outflow_rest = vaex.concat(df_outflows_rest)
    else:
        # create an empty dataframe with the same schema to make the rest of the code simpler
        df_outflow_rest = vaex.from_scalars(pickup_borough=-1,
                                            dropoff_borough=-1,
                                            dropoff_zone=-1,
                                            count_trips=-1)[:0]

    # return as dict and lists so it can be serialized by the memoize decorator
    flow_data = dict(
        outflow_top=df_outflow_top.to_dict(array_type='list'),
        outflow_rest=df_outflow_rest.to_dict(array_type='list'),
        outflow_borough=df_outflow_borough.to_dict(array_type='list'))
    return flow_data