Python connect Examples

Programming Language: Python

Namespace/Package Name: vaex

Method/Function: connect

Examples at hotexamples.com: 6

Python connect - 6 examples found. These are the top-rated real-world Python examples of vaex.connect extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def tornado_client(webserver, df_server, df_server_huge, event_loop):
    """Yield a vaex client connected to the local test server.

    The ``obj``, ``datetime`` and ``timedelta`` columns are dropped from
    ``df_server`` in place, after which ``df_server`` and ``df_server_huge``
    are registered on ``webserver``. The client is closed once the generator
    is resumed after the ``yield``. ``event_loop`` is not used in the body.
    """
    served_df = df_server
    for column_name in ('obj', 'datetime', 'timedelta'):
        served_df.drop(column_name, inplace=True)
    webserver.set_datasets([served_df, df_server_huge])
    server_url = "%s://localhost:%d" % (scheme, test_port)
    client = vaex.connect(server_url)
    yield client
    client.close()

Example #2

Show file

def open(path,
         convert=False,
         shuffle=False,
         copy_index=False,
         *args,
         **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths.
        Aliases, remote servers (http/ws) and clusters are only resolved for a single path.
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :param bool copy_index: copy index when source is read via pandas
    :return: the opened DataFrame
    :rtype: DataFrame
    :raises IOError: when no file matches the path, or when the file could not be opened

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile_name and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile_name=myprofile')

    GCS support:
    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:
     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    try:
        if not isinstance(path, (list, tuple)):
            # Aliases, remote servers and clusters only apply to a single
            # path; a list of paths goes straight to the local-file handling
            # below (it has no ``startswith`` and a list is not hashable).
            if path in aliases:
                path = aliases[path]
            # TODO: think about https and wss
            remote_prefixes = ("http://", "ws://", "vaex+http://", "vaex+ws://")
            if path.startswith(remote_prefixes):
                # Everything before the last '/' is the server, the remainder
                # (minus any query string) is the dataset name.
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                # Only the first value of each query parameter is used.
                extra_args = {
                    key: values[0]
                    for key, values in parse_qs(url.query).items()
                }
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        filenames = []
        for path in paths:
            # TODO: can we do glob with s3?
            if path.startswith(('s3://', 'gs://')):
                filenames.append(path)
            else:
                # sort to get predictable behaviour (useful for testing)
                filenames.extend(sorted(glob.glob(path)))
        ds = None
        if not filenames:
            raise IOError(
                'Could not open file: {}, it does not exist'.format(path))
        filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
        if len(filenames) == 1:
            path = filenames[0]
            # Strip a trailing query string before looking at the extension.
            naked_path = path
            if '?' in naked_path:
                naked_path = naked_path[:naked_path.index('?')]
            ext = os.path.splitext(naked_path)[1]
            if os.path.exists(filename_hdf5) and convert:  # also check mtime?
                # A previously converted file exists: reuse it.
                ds = vaex.file.open(filename_hdf5)
            else:
                # special support for csv.. should probably approach it a different way
                if ext == '.csv' or naked_path.endswith(".csv.bz2"):
                    csv_convert = filename_hdf5 if convert else False
                    ds = from_csv(path,
                                  copy_index=copy_index,
                                  convert=csv_convert,
                                  **kwargs)
                else:
                    ds = vaex.file.open(path, *args, **kwargs)
                    if convert and ds:
                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                        # argument were meant for pandas?
                        ds = vaex.file.open(filename_hdf5)
            if ds is None:
                if os.path.exists(path):
                    raise IOError(
                        'Could not open file: {}, did you install vaex-hdf5? Is the format supported?'
                        .format(path))
        elif len(filenames) > 1:
            # A non-boolean ``convert`` is used as the output filename.
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                ds = open(filename_hdf5)
            else:
                # with ProcessPoolExecutor() as executor:
                # executor.submit(read_csv_and_convert, filenames, shuffle=shuffle, **kwargs)
                dfs = []
                for filename in filenames:
                    # Forward copy_index so every file is read the same way
                    # as it would be when opened on its own.
                    dfs.append(
                        open(filename,
                             convert=bool(convert),
                             shuffle=shuffle,
                             copy_index=copy_index,
                             **kwargs))
                ds = concat(dfs)
                if convert:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    ds = vaex.file.open(filename_hdf5)

        if ds is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return ds
    except Exception:
        # Pass ``path`` as a logging argument: eager ``%`` formatting would
        # itself fail when ``path`` is a tuple.
        logging.getLogger("vaex").error("error opening %r", path)
        raise

Example #3

Show file

def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: Uses `dataframe.export` when convert is a path. If True, ``convert=path+'.hdf5'``
                    The conversion is skipped if the input file or conversion argument did not change.
    :param progress: (_Only applies when convert is not False_) {progress}
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: Extra arguments passed to an optional file system if needed:
        * Amazon AWS S3
            * `anonymous` - access file without authentication (public files)
            * `access_key` - AWS access key, if not provided will use the standard env vars, or the `~/.aws/credentials` file
            * `secret_key` - AWS secret key, similar to `access_key`
            * `profile` - If multiple profiles are present in `~/.aws/credentials`, pick this one instead of 'default', see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            * `region` - AWS Region, e.g. 'us-east-1', will be determined automatically if not provided.
            * `endpoint_override` - URL/ip to connect to, instead of AWS, e.g. 'localhost:9000' for minio
        * Google Cloud Storage
            * :py:class:`gcsfs.core.GCSFileSystem`
        In addition you can pass the boolean "cache" option.
    :param group: (optional) Specify the group to be read from an HDF5 file. By default this is set to "/table".
    :param fs: Apache Arrow FileSystem object, or FSSpec FileSystem object, if specified, fs_options should be empty.
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: the opened DataFrame
    :rtype: DataFrame
    :raises IOError: when the file pattern did not match anything, or the file could not be opened

    Cloud storage support:

    Vaex supports streaming of HDF5 files from Amazon AWS S3 and Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/(s3|gs) such that successive access
    is as fast as native disk access.

    The following common fs_options are used for S3 access:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)

    All fs_options can also be encoded in the file path as a query string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', fs_options={{'anonymous': True}})
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5', fs_options={{'access_key': my_key, 'secret_key': my_secret_key}})
    >>> df = vaex.open(f's3://mybucket/path/to/file.hdf5?access_key={{my_key}}&secret_key={{my_secret_key}}')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myproject')

    Google Cloud Storage support:

    The following fs_options are used for GCP access:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5', fs_options={{'token': None}})
    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    # NOTE: ``fs_options`` defaults to a shared dict; this function only
    # passes it on to other calls and never mutates it itself.
    import vaex
    import vaex.convert
    # Imported up front because vaex.file.stringyfy is used right away.
    import vaex.file
    import glob
    try:
        if not isinstance(path, (list, tuple)):
            # remote and clusters only support single path, not a list
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            remote_prefixes = ("http://", "ws://", "vaex+wss://", "wss://",
                               "vaex+http://", "vaex+ws://")
            if path.startswith(remote_prefixes):
                # Everything before the last '/' is the server, the remainder
                # (minus any query string) is the dataset name.
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                # Only the first value of each query parameter is used.
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        filenames = []
        for path in paths:
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            # Only the option-free part of the path decides whether to glob.
            naked_path, _ = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                filenames.extend(sorted(vaex.file.glob(path, fs_options=fs_options, fs=fs)))
            else:
                filenames.append(path)
        df = None
        if not filenames:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
            if convert:
                # A string ``convert`` is the output path; otherwise use the derived name.
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(
                    path_input=path, fs_options_input=fs_options, fs_input=fs,
                    path_output=path_output, fs_options_output=fs_options, fs_output=fs,
                    progress=progress,
                    *args, **kwargs
                )
                ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs, **kwargs)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        elif len(filenames) > 1:
            # A non-boolean ``convert`` is used as the output filename.
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                dfs = []
                for filename in filenames:
                    dfs.append(vaex.open(filename, fs_options=fs_options, fs=fs, convert=bool(convert), shuffle=shuffle, **kwargs))
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5, progress=progress)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except Exception:
        # Pass ``path`` as a logging argument: eager ``%`` formatting raises
        # TypeError when ``path`` is still a tuple (e.g. an empty tuple of
        # paths), which would mask the original error.
        logger.exception("error opening %r", path)
        raise

Example #4

Show file

File: __init__.py Project: yokeldd/vaex

def open(path, convert=False, shuffle=False, fs_options={}, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths.
        Aliases, remote servers (http/ws) and clusters are only resolved for a single path.
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: passed on as ``fs_options`` when reading csv, converting and opening datasets
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: the opened DataFrame
    :rtype: DataFrame
    :raises IOError: when the file pattern did not match anything, or the file could not be opened

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myprofile')

    GCS support:
    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:
     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    # NOTE: ``fs_options`` defaults to a shared dict; this function only
    # passes it on to other calls and never mutates it itself.
    import vaex
    import vaex.convert
    # Imported up front because vaex.file.stringyfy is used right away.
    import vaex.file
    import glob
    try:
        if not isinstance(path, (list, tuple)):
            # Aliases, remote servers and clusters only apply to a single
            # path; a list of paths goes straight to the file handling below.
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            # TODO: think about https and wss
            remote_prefixes = ("http://", "ws://", "vaex+http://", "vaex+ws://")
            if path.startswith(remote_prefixes):
                # Everything before the last '/' is the server, the remainder
                # (minus any query string) is the dataset name.
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                # Only the first value of each query parameter is used.
                extra_args = {
                    key: values[0]
                    for key, values in parse_qs(url.query).items()
                }
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        filenames = []
        for path in paths:
            path = vaex.file.stringyfy(path)
            # Only the option-free part of the path decides whether to glob.
            naked_path, _ = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                filenames.extend(sorted(vaex.file.glob(path, **kwargs)))
            else:
                filenames.append(path)
        df = None
        if not filenames:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames,
                                                   shuffle=shuffle)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path,
                                     fs_options=fs_options,
                                     convert=convert,
                                     **kwargs)
            if convert:
                # A string ``convert`` is the output path; otherwise use the derived name.
                path_output = convert if isinstance(convert,
                                                    str) else filename_hdf5
                vaex.convert.convert(path_input=path,
                                     fs_options_input=fs_options,
                                     path_output=path_output,
                                     fs_options_output=fs_options,
                                     *args,
                                     **kwargs)
                ds = vaex.dataset.open(path_output, fs_options=fs_options)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError(
                        'Could not open file: {}, did you install vaex-hdf5? Is the format supported?'
                        .format(path))
        elif len(filenames) > 1:
            # A non-boolean ``convert`` is used as the output filename.
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = vaex.convert._convert_name(filenames,
                                                           shuffle=shuffle)
            if os.path.exists(
                    filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                dfs = []
                for filename in filenames:
                    # Forward fs_options so every file is opened with the
                    # same file system settings as a single-file open.
                    dfs.append(
                        vaex.open(filename,
                                  convert=bool(convert),
                                  shuffle=shuffle,
                                  fs_options=fs_options,
                                  **kwargs))
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except Exception:
        # Pass ``path`` as a logging argument: eager ``%`` formatting would
        # itself fail when ``path`` is a tuple.
        logging.getLogger("vaex").error("error opening %r", path)
        raise

Example #5

Show file

File: common.py Project: stjordanis/vaex

def tornado_client(webserver, event_loop):
    """Yield a vaex client connected to ``webserver`` on localhost.

    The URL is built from the module-level ``scheme`` and ``webserver.port``.
    The client is closed once the generator is resumed after the ``yield``.
    ``event_loop`` is not used in the body.
    """
    server_url = "%s://localhost:%d" % (scheme, webserver.port)
    connection = vaex.connect(server_url)
    yield connection
    connection.close()

Example #6

Show file

File: token_test.py Project: zhenyu-captain/vaex

def server(vaex_server):
    """Yield a vaex connection to the local test server.

    The URL is built from the module-level ``scheme`` and ``test_port``.
    The connection is closed once the generator is resumed after the
    ``yield``. ``vaex_server`` is not used in the body.
    """
    server_url = "%s://localhost:%d" % (scheme, test_port)
    connection = vaex.connect(server_url)
    yield connection
    connection.close()