Example 1
def test_passes_two_datasets_different_expressions():
    x = np.array([2.])
    y = x**2
    dataset = vaex.dataset.DatasetArrays(x=x, y=y)
    df1 = vaex.from_dataset(dataset)
    df2 = vaex.from_dataset(dataset)
    df1['a'] = 'x * y'
    df2['b'] = 'x + y'
    executor = df1.executor
    executor.passes = 0
    s1 = df1.sum('a', delay=True)
    s2 = df2.sum('b', delay=True)
    df1.execute()
    assert executor.passes == 1
    assert s1.get() == 2 * 4
    assert s2.get() == 2 + 4
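The test above relies on vaex's delayed evaluation: several aggregations are scheduled with delay=True and then computed together in a single pass over the data when execute() is called. A minimal sketch of that pattern, assuming a local vaex installation and using the documented vaex.from_arrays constructor:

import numpy as np
import vaex

df = vaex.from_arrays(x=np.arange(5.0), y=np.arange(5.0) ** 2)
sum_x = df.sum('x', delay=True)   # schedule the aggregation, nothing runs yet
sum_y = df.sum('y', delay=True)
df.execute()                      # a single pass over the data computes both tasks
print(sum_x.get(), sum_y.get())   # 10.0 30.0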
Example 2
async def get_df(name):
    if name not in datasets:
        raise HTTPException(status_code=404,
                            detail=f"dataset {name!r} not found")
    # for now we only allow 1 request to execute at a time
    async with global_lock:
        yield vaex.from_dataset(datasets[name])
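The HTTPException suggests this async generator is a FastAPI dependency: it yields a DataFrame for the requested dataset while holding a global lock so only one request executes at a time. A hypothetical sketch of how such a dependency could be wired into a route; FastAPI, the app object and the route path are assumptions, while get_df, datasets and global_lock refer to the snippet above:

import asyncio
from fastapi import Depends, FastAPI

app = FastAPI()
datasets = {}                 # name -> vaex.dataset.Dataset, filled elsewhere
global_lock = asyncio.Lock()

@app.get("/dataset/{name}/count")
async def count(df=Depends(get_df)):  # get_df is the async generator above
    # FastAPI resolves get_df's `name` argument from the path parameter
    return {"rows": len(df)}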
Example 3
        def process(args):
            counts, arrays = args
            # arrays = {key: value.get() for key, value in arrays.items()}
            # take out the edges
            arrays = {
                key: vaex.utils.extract_central_part(value)
                for key, value in arrays.items()
            }
            counts = vaex.utils.extract_central_part(counts)

            # make sure we respect the sorting
            def sort(ar):
                for i, by in list(enumerate(self.by))[::-1]:
                    sort_indices = by.sort_indices
                    if sort_indices is not None:
                        # if sort_indices come from arrow, it will be uint64
                        # which np.take does not like
                        sort_indices = vaex.array_types.to_numpy(sort_indices)
                        if sort_indices.dtype == np.dtype("uint64"):
                            sort_indices = sort_indices.astype("int64")
                        ar = np.take(ar, sort_indices, axis=i)
                return ar

            arrays = {key: sort(value) for key, value in arrays.items()}

            if self.combine and self.expand and isinstance(
                    self.by[0], GrouperCombined):
                assert len(self.by) == 1
                values = self.by[0].bin_values
                columns = {
                    field.name: ar
                    for field, ar in zip(values.type, values.flatten())
                }
                for key, value in arrays.items():
                    assert value.ndim == 1
                    columns[key] = value
            else:
                counts = sort(counts)
                mask = counts > 0
                columns = {}
                for by, indices in zip(self.by, np.where(mask)):
                    columns[by.label] = by.bin_values.take(indices)
                if mask.sum() == mask.size:
                    # if we want all, just take it all
                    # should be faster
                    for key, value in arrays.items():
                        columns[key] = value.ravel()
                else:
                    for key, value in arrays.items():
                        columns[key] = value[mask]
            dataset_arrays = vaex.dataset.DatasetArrays(columns)
            dataset = DatasetGroupby(dataset_arrays,
                                     self.df,
                                     self.by_original,
                                     actions,
                                     combine=self.combine,
                                     expand=self.expand,
                                     sort=self.sort)
            df_grouped = vaex.from_dataset(dataset)
            return df_grouped
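This process callback is the tail end of a groupby: the aggregated grids are sorted, empty combinations are dropped, and the result is wrapped in a new DataFrame via vaex.from_dataset. A small sketch of the public groupby API that this code serves, assuming a local vaex installation; the column names are illustrative:

import numpy as np
import vaex

df = vaex.from_arrays(city=np.array(['a', 'a', 'b']),
                      value=np.array([1.0, 2.0, 3.0]))
grouped = df.groupby('city', agg={'value_sum': vaex.agg.sum('value')})
print(grouped)  # one row per group that actually occurs in the data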
Example 4
def add_graphql():
    import vaex.graphql
    import graphene
    from starlette.graphql import GraphQLApp
    dfs = {name: vaex.from_dataset(ds) for name, ds in datasets.items()}
    Query = vaex.graphql.create_query(dfs)
    schema = graphene.Schema(query=Query)
    app.add_route("/graphql", GraphQLApp(schema=schema))
Example 5
def cached_output(*args, **kwargs):
    ds = vaex.dataset.open(path_input,
                           fs_options=fs_options_input,
                           *args,
                           **kwargs)
    if ds is not None:
        df = vaex.from_dataset(ds)
        df.export(path_output)
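The helper above converts an input file by opening it as a Dataset and exporting the resulting DataFrame. A minimal sketch of that open/export round trip, assuming a local vaex installation with vaex-hdf5 available; the file names are illustrative:

import numpy as np
import vaex

df = vaex.from_arrays(x=np.arange(5))
df.export('converted.hdf5')   # output format is inferred from the extension
df2 = vaex.open('converted.hdf5')
assert df2.x.tolist() == list(range(5))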
Example 6
def test_passes_two_datasets_different_vars():
    x = np.array([2.])
    y = x**2
    dataset = vaex.dataset.DatasetArrays(x=x, y=y)
    df1 = vaex.from_dataset(dataset)
    df2 = vaex.from_dataset(dataset)
    df1.variables['a'] = 1
    df2.variables['a'] = 2
    df1['z'] = 'x + y * a'
    df2['z'] = 'x + y * a'
    executor = df1.executor
    executor.passes = 0
    s1 = df1.sum('z', delay=True)
    s2 = df2.sum('z', delay=True)
    df1.execute()
    assert executor.passes == 1
    assert s1.get() == 2 + 4 * 1
    assert s2.get() == 2 + 4 * 2
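Here the two DataFrames share one dataset but carry different values for the variable a, and the executor still finishes both sums in a single pass. A short sketch of DataFrame variables referenced from expressions, assuming a local vaex installation:

import numpy as np
import vaex

df = vaex.from_arrays(x=np.array([1.0, 2.0, 3.0]))
df.variables['scale'] = 10.0   # stored on the DataFrame, not as a column
df['x_scaled'] = 'x * scale'   # variables can be used inside expressions
print(df.x_scaled.tolist())    # [10.0, 20.0, 30.0]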
Example 7
    def agg(self, actions):
        # TODO: this basically forms a cartesian product, we can do better, use a
        # 'multistage' hashmap
        arrays = super(GroupBy, self)._agg(actions)
        has_non_existing_pairs = len(self.by) > 1
        # we don't want non-existing pairs (e.g. Amsterdam in France does not exist)
        counts = self.counts
        # nobody asked for count(*), but we need it when non-existing pairs are included
        if has_non_existing_pairs and counts is None:
            # TODO: it seems this path is never tested
            count_agg = vaex.agg.count(edges=True)
            counts = self.df._agg(count_agg, self.binners, delay=_USE_DELAY)
        self.df.execute()
        if _USE_DELAY:
            arrays = {key: value.get() for key, value in arrays.items()}
            if has_non_existing_pairs:
                counts = counts.get()
        # take out the edges
        arrays = {key: vaex.utils.extract_central_part(value) for key, value in arrays.items()}
        if has_non_existing_pairs:
            counts = vaex.utils.extract_central_part(counts)

        # make sure we respect the sorting
        sorting = tuple(by.sort_indices if by.sort_indices is not None else slice(None) for by in self.by)
        arrays = {key: value[sorting] for key, value in arrays.items()}

        if self.combine and self.expand and isinstance(self.by[0], GrouperCombined):
            assert len(self.by) == 1
            values = self.by[0].bin_values
            columns = {field.name: ar for field, ar in zip(values.type, values.flatten())}
            for key, value in arrays.items():
                assert value.ndim == 1
                columns[key] = value
        else:
            if has_non_existing_pairs:
                counts = counts[sorting]
                mask = counts > 0
                coords = [coord[mask] for coord in np.meshgrid(*self._coords1d, indexing='ij')]
                columns = {by.label: coord for by, coord in zip(self.by, coords)}
                for key, value in arrays.items():
                    columns[key] = value[mask]
            else:
                columns = {by.label: coord for by, coord in zip(self.by, self._coords1d)}
                for key, value in arrays.items():
                    assert value.ndim == 1
                    columns[key] = value
        dataset_arrays = vaex.dataset.DatasetArrays(columns)
        dataset = DatasetGroupby(dataset_arrays, self.df, self.by_original, actions, combine=self.combine, expand=self.expand, sort=self.sort)
        df_grouped = vaex.from_dataset(dataset)
        return df_grouped
Example 8
def update_service(dfs=None):
    global service_threaded
    import vaex.server.service
    if dfs is None:
        dfs = {
            name: vaex.from_dataset(dataset)
            for name, dataset in datasets.items()
        }

    service_bare = vaex.server.service.Service(dfs)
    server_thread_count = 1
    threads_per_job = 32
    service_threaded = vaex.server.service.AsyncThreadedService(
        service_bare, server_thread_count, threads_per_job)
Example 9
def test_concat_chunk_iterator(l1, l2):
    i1 = 0
    i2 = i1 + l1
    i3 = i2 + l2
    x = np.arange(10)
    y = x**2
    g = x // 3
    ds = vaex.dataset.DatasetArrays(x=x, y=y, g=g)
    df_original = df = vaex.from_dataset(ds)
    df1 = df[i1:i2]
    df2 = df[i2:i3]
    df3 = df[i3:]
    df = vaex.concat([df1, df2, df3])
    ds_full = ds = df.dataset

    # very similar to the arrow/dataset_test.py parquet test
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    # no columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    ds = ds[1:10]
    assert 'x' in ds
    assert ds.row_count == 9
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        if i == 4:
            assert i1 == 8
            assert i2 == 9
        else:
            assert i1 == i * 2
            assert i2 == (i + 1) * 2
        # chunks = chunks
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    ds = ds[1:9]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    # no columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    # again, but here we skip a total of one chunk_size at the end
    ds = ds_full[:8]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    for i in range(9):
        for j in range(i + 1, 10):
            ds = ds_full.slice(i, j)
            values = []
            for i1, i2, chunks in ds.chunk_iterator(['x']):
                values.extend(chunks['x'].tolist())
            assert x[i:j].tolist() == values

    assert df.x.tolist() == x.tolist()
    assert df.g.tolist() == g.tolist()

    ds_dropped = ds.dropped('x')
    assert 'x' not in ds_dropped
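The test walks chunk_iterator over a concatenated dataset and checks that chunk boundaries and slicing behave the same as on a single in-memory dataset. A condensed sketch of that pattern, assuming a local vaex installation; the sizes are chosen so the chunks line up with the concatenated parts:

import numpy as np
import vaex

x = np.arange(6)
df = vaex.from_arrays(x=x, y=x ** 2)
df_cat = vaex.concat([df[:3], df[3:]])  # two slices glued back together
for i1, i2, chunks in df_cat.dataset.chunk_iterator(['x'], chunk_size=3):
    print(i1, i2, chunks['x'].tolist())  # two chunks of three rows each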
Example 10
def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: Uses `dataframe.export` when convert is a path. If True, ``convert=path+'.hdf5'``
                    The conversion is skipped if the input file or conversion argument did not change.
    :param progress: (_Only applies when convert is not False_) {progress}
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: Extra arguments passed to an optional file system if needed:
        * Amazon AWS S3
            * `anonymous` - access file without authentication (public files)
            * `access_key` - AWS access key, if not provided will use the standard env vars, or the `~/.aws/credentials` file
            * `secret_key` - AWS secret key, similar to `access_key`
            * `profile` - If multiple profiles are present in `~/.aws/credentials`, pick this one instead of 'default', see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            * `region` - AWS Region, e.g. 'us-east-1', will be determined automatically if not provided.
            * `endpoint_override` - URL/ip to connect to, instead of AWS, e.g. 'localhost:9000' for minio
        * Google Cloud Storage
            * :py:class:`gcsfs.core.GCSFileSystem`
        In addition you can pass the boolean "cache" option.
    :param group: (optional) Specify the group to be read from an HDF5 file. By default this is set to "/table".
    :param fs: Apache Arrow FileSystem object, or FSSpec FileSystem object, if specified, fs_options should be empty.
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    Cloud storage support:

    Vaex supports streaming of HDF5 files from Amazon AWS S3 and Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/(s3|gs) such that successive access
    is as fast as native disk access.

    The following common fs_options are used for S3 access:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)

    All fs_options can also be encoded in the file path as a query string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', fs_options={{'anonymous': True}})
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5', fs_options={{'access_key': my_key, 'secret_key': my_secret_key}})
    >>> df = vaex.open(f's3://mybucket/path/to/file.hdf5?access_key={{my_key}}&secret_key={{my_secret_key}}')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myproject')

    Google Cloud Storage support:

    The following fs_options are used for GCP access:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5', fs_options={{'token': None}})
    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        if not isinstance(path, (list, tuple)):
            # remote and clusters only support single path, not a list
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            if path.startswith("http://") or path.startswith("ws://") or \
                path.startswith("vaex+wss://") or path.startswith("wss://") or \
                path.startswith("vaex+http://") or path.startswith("vaex+ws://"):
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        filenames = []
        for path in paths:
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            naked_path, options = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                filenames.extend(list(sorted(vaex.file.glob(path, fs_options=fs_options, fs=fs))))
            else:
                filenames.append(path)
        df = None
        if len(filenames) == 0:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        filename_hdf5_noshuffle = vaex.convert._convert_name(filenames, shuffle=False)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
            if convert:
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(
                    path_input=path, fs_options_input=fs_options, fs_input=fs,
                    path_output=path_output, fs_options_output=fs_options, fs_output=fs,
                    progress=progress,
                    *args, **kwargs
                )
                ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs, **kwargs)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        elif len(filenames) > 1:
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                dfs = []
                for filename in filenames:
                    dfs.append(vaex.open(filename, fs_options=fs_options, fs=fs, convert=bool(convert), shuffle=shuffle, **kwargs))
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5, progress=progress)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logger.exception("error opening %r" % path)
        raise
Example 11
async def get_df(name):
    if name not in datasets:
        raise HTTPException(status_code=404,
                            detail=f"dataset {name!r} not found")
    yield vaex.from_dataset(datasets[name])
Example 12
def open(path, convert=False, shuffle=False, fs_options={}, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myprofile')

    GCS support:

    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        path = vaex.file.stringyfy(path)
        if path in aliases:
            path = aliases[path]
        path = vaex.file.stringyfy(path)
        if path.startswith("http://") or path.startswith("ws://") or \
           path.startswith("vaex+http://") or path.startswith("vaex+ws://"):  # TODO: think about https and wss
            server, name = path.rsplit("/", 1)
            url = urlparse(path)
            if '?' in name:
                name = name[:name.index('?')]
            extra_args = {
                key: values[0]
                for key, values in parse_qs(url.query).items()
            }
            if 'token' in extra_args:
                kwargs['token'] = extra_args['token']
            if 'token_trusted' in extra_args:
                kwargs['token_trusted'] = extra_args['token_trusted']
            client = vaex.connect(server, **kwargs)
            return client[name]
        if path.startswith("cluster"):
            import vaex.enterprise.distributed
            return vaex.enterprise.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            if isinstance(path, str):
                paths = [path]
            else:
                paths = path
            filenames = []
            for path in paths:
                naked_path, options = vaex.file.split_options(path)
                if glob.has_magic(naked_path):
                    filenames.extend(
                        list(sorted(vaex.file.glob(path, **kwargs))))
                else:
                    filenames.append(path)
            df = None
            if len(filenames) == 0:
                raise IOError(f'File pattern did not match anything {path}')
            filename_hdf5 = vaex.convert._convert_name(filenames,
                                                       shuffle=shuffle)
            filename_hdf5_noshuffle = vaex.convert._convert_name(filenames,
                                                                 shuffle=False)
            if len(filenames) == 1:
                path = filenames[0]
                # # naked_path, _ = vaex.file.split_options(path, fs_options)
                _, ext, _ = vaex.file.split_ext(path)
                if ext == '.csv':  # special case for csv
                    return vaex.from_csv(path,
                                         fs_options=fs_options,
                                         convert=convert,
                                         **kwargs)
                if convert:
                    path_output = convert if isinstance(convert,
                                                        str) else filename_hdf5
                    vaex.convert.convert(path_input=path,
                                         fs_options_input=fs_options,
                                         path_output=path_output,
                                         fs_options_output=fs_options,
                                         *args,
                                         **kwargs)
                    ds = vaex.dataset.open(path_output, fs_options=fs_options)
                else:
                    ds = vaex.dataset.open(path, fs_options=fs_options)
                df = vaex.from_dataset(ds)
                if df is None:
                    if os.path.exists(path):
                        raise IOError(
                            'Could not open file: {}, did you install vaex-hdf5? Is the format supported?'
                            .format(path))
            elif len(filenames) > 1:
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = vaex.convert._convert_name(filenames,
                                                               shuffle=shuffle)
                if os.path.exists(
                        filename_hdf5) and convert:  # also check mtime
                    df = vaex.open(filename_hdf5)
                else:
                    dfs = []
                    for filename in filenames:
                        dfs.append(
                            vaex.open(filename,
                                      convert=bool(convert),
                                      shuffle=shuffle,
                                      **kwargs))
                    df = vaex.concat(dfs)
                    if convert:
                        if shuffle:
                            df = df.shuffle()
                        df.export_hdf5(filename_hdf5)
                        df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
Example 13
        def process(args):
            counts, arrays = args
            logger.info(f"aggregated on grid, constructing dataframe...")
            if counts is not None:
                for name, array in arrays.items():
                    if array.shape != counts.shape:
                        raise RuntimeError(f'{array} {name} has shape {array.shape} while we expected {counts.shape}')

            arrays = {key: self._extract_center(value) for key, value in arrays.items()}
            if not self.dense:
                counts = self._extract_center(counts)

            # make sure we respect the sorting
            def sort(ar):
                for i, by in list(enumerate(self.by))[::-1]:
                    sort_indices = by.sort_indices
                    if sort_indices is not None:
                        # if sort_indices come from arrow, it will be uint64
                        # which np.take does not like
                        sort_indices = vaex.array_types.to_numpy(sort_indices)
                        if sort_indices.dtype == np.dtype("uint64"):
                            sort_indices = sort_indices.astype("int64")
                        ar = np.take(ar, sort_indices, axis=i)
                return ar

            arrays = {key: sort(value) for key, value in arrays.items()}

            if self.combine and self.expand and isinstance(self.by[0], GrouperCombined):
                assert len(self.by) == 1
                values = self.by[0].bin_values
                columns = {field.name: ar for field, ar in zip(values.type, values.flatten())}
                for key, value in arrays.items():
                    assert value.ndim == 1
                    columns[key] = value
            else:
                columns = {}
                if self.dense:
                    if len(self.by) == 1:
                        for by in self.by:
                            columns[by.label] = by.bin_values
                    else:
                        array0 = arrays[list(arrays)[0]]
                        # similar to the where, this creates indices like [0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]
                        indices = [k.ravel() for k in np.mgrid[[slice(0, n) for n in array0.shape]]]
                        for by, index in zip(self.by, indices):
                            columns[by.label] = vaex.array_types.take(by.bin_values, index)

                else:
                    counts = sort(counts)
                    mask = counts > 0
                    for by, indices in zip(self.by, np.where(mask)):
                        columns[by.label] = by.bin_values.take(indices)
                if self.dense or mask.sum() == mask.size:
                    # if we want all, just take it all
                    # should be faster
                    for key, value in arrays.items():
                        columns[key] = value.ravel()
                else:
                    for key, value in arrays.items():
                        columns[key] = value[mask]
            logger.info(f"constructed dataframe")
            dataset_arrays = vaex.dataset.DatasetArrays(columns)
            dataset = DatasetGroupby(dataset_arrays, self.df, self.by_original, actions, combine=self.combine, expand=self.expand, sort=self.sort)
            df_grouped = vaex.from_dataset(dataset)
            return df_grouped
Example 14
def test_parquet(l1, l2, rebuild_dataset):
    i1 = 0
    i2 = i1 + l1
    i3 = i2 + l2

    x = np.arange(10)
    y = x**2
    g = x // 3
    ds = vaex.dataset.DatasetArrays(x=x, y=y, g=g)
    df = vaex.from_dataset(ds)
    path1 = HERE.parent / 'data' / 'parquet' / 'test1.parquet'
    path2 = HERE.parent / 'data' / 'parquet' / 'test2.parquet'
    path3 = HERE.parent / 'data' / 'parquet' / 'test3.parquet'
    path1.parent.mkdir(exist_ok=True)
    # df.export(str(path))
    pyarrow.parquet.write_table(df[i1:i2].to_arrow_table(),
                                str(path1),
                                row_group_size=2)
    pyarrow.parquet.write_table(df[i2:i3].to_arrow_table(),
                                str(path2),
                                row_group_size=2)
    pyarrow.parquet.write_table(df[i3:].to_arrow_table(),
                                str(path3),
                                row_group_size=2)
    ds = vaex.arrow.dataset.open_parquet([str(path1), str(path2), str(path3)])
    # TODO: future PR will require this:
    df = vaex.from_dataset(ds)
    ds_full = ds = df.dataset

    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].to_pylist() == x[i1:i2].tolist()
        assert chunks['y'].to_pylist() == y[i1:i2].tolist()

    ds = ds[1:10]
    assert 'x' in ds
    assert ds.row_count == 9
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        if i == 4:
            assert i1 == 8
            assert i2 == 9
        else:
            assert i1 == i * 2
            assert i2 == (i + 1) * 2
        # chunks = chunks
        assert chunks['x'].to_pylist() == x[i1:i2].tolist()
        assert chunks['y'].to_pylist() == y[i1:i2].tolist()

    ds = ds[1:9]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].to_pylist() == x[i1:i2].tolist()
        assert chunks['y'].to_pylist() == y[i1:i2].tolist()

    # empty columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    for i in range(9):
        for j in range(i + 1, 10):
            ds = ds_full.slice(i, j)
            values = []
            for i1, i2, chunks in ds.chunk_iterator(['x']):
                values.extend(chunks['x'].to_pylist())
            assert x[i:j].tolist() == values

    assert df.x.tolist() == x.tolist()
    assert df.g.tolist() == g.tolist()
    # ds.chunk_size = 4

    ds_dropped = ds.dropped('x')
    assert 'x' not in ds_dropped

    assert rebuild_dataset(ds).hashed() == ds.hashed()
Example 15
def test_parquet():
    x = np.arange(10)
    y = x**2
    g = x // 3
    ds = vaex.dataset.DatasetArrays(x=x, y=y, g=g)
    df = vaex.from_dataset(ds)
    path = HERE.parent / 'data' / 'parquet' / 'test.parquet'
    path.parent.mkdir(exist_ok=True)
    # df.export(str(path))
    pyarrow.parquet.write_table(df.to_arrow_table(), str(path), row_group_size=2)

    df = vaex.open(str(path))
    ds_full = ds = df.dataset

    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i*2
        assert i2 == (i + 1) * 2
        assert chunks['x'].to_pylist() == x[i1:i2].tolist()
        assert chunks['y'].to_pylist() == y[i1:i2].tolist()

    ds = ds[1:10]
    assert 'x' in ds
    assert ds.row_count == 9
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        if i == 4:
            assert i1 == 8
            assert i2 == 9
        else:
            assert i1 == i*2
            assert i2 == (i + 1) * 2
        # chunks = chunks
        assert chunks['x'].to_pylist() == x[i1:i2].tolist()
        assert chunks['y'].to_pylist() == y[i1:i2].tolist()

    ds = ds[1:9]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i*2
        assert i2 == (i + 1) * 2
        assert chunks['x'].to_pylist() == x[i1:i2].tolist()
        assert chunks['y'].to_pylist() == y[i1:i2].tolist()

    # empty columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i*2
        assert i2 == (i + 1) * 2

    for i in range(9):
        for j in range(i+1, 10):
            ds = ds_full.slice(i, j)
            values = []
            for i1, i2, chunks in ds.chunk_iterator(['x']):
                values.extend(chunks['x'].to_pylist())
            assert x[i:j].tolist() == values

    assert df.x.tolist() == x.tolist()
    assert df.g.tolist() == g.tolist()
    # ds.chunk_size = 4

    ds_dropped = ds.dropped('x')
    assert 'x' not in ds_dropped
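Both parquet tests build on the same round trip: write an Arrow table with pyarrow.parquet.write_table and reopen it with vaex.open, so the DataFrame is backed by a parquet dataset. A minimal sketch of that round trip, assuming pyarrow and a local vaex installation; the path is illustrative:

import numpy as np
import pyarrow.parquet
import vaex

df = vaex.from_arrays(x=np.arange(10))
pyarrow.parquet.write_table(df.to_arrow_table(), 'example.parquet', row_group_size=2)
df2 = vaex.open('example.parquet')
assert df2.x.tolist() == list(range(10))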