def tornado_client(webserver, df_server, df_server_huge, event_loop):
    """Yield a vaex client connected to localhost on ``test_port``.

    Before connecting, the 'obj', 'datetime' and 'timedelta' columns are
    dropped in place from ``df_server``; the trimmed DataFrame is then
    registered on ``webserver`` together with ``df_server_huge``.  The client
    is closed when the generator is resumed after the ``yield``.
    """
    served = df_server
    for column in ('obj', 'datetime', 'timedelta'):
        served.drop(column, inplace=True)
    webserver.set_datasets([served, df_server_huge])

    address = "%s://localhost:%d" % (scheme, test_port)
    connection = vaex.connect(address)
    yield connection
    connection.close()
def open(path, convert=False, shuffle=False, copy_index=False, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :param bool copy_index: copy index when source is read via pandas
    :return: the opened DataFrame
    :rtype: DataFrame
    :raises IOError: when no file matches ``path`` or the file could not be opened

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile_name and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile_name=myprofile')

    GCS support:

    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    try:
        # Aliases, remote servers and clusters only apply to a single path.
        # A list/tuple cannot be looked up in ``aliases`` and has no
        # ``startswith``, so it goes straight to the file handling below.
        if not isinstance(path, (list, tuple)):
            if path in aliases:
                path = aliases[path]
            if path.startswith(("http://", "ws://", "vaex+http://", "vaex+ws://")):  # TODO: think about https and wss
                # Everything before the last '/' is the server, the remainder the dataset name.
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                # Only the first value of each query parameter is used.
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        paths = [path] if isinstance(path, str) else path
        filenames = []
        # NOTE: the loop deliberately rebinds ``path``; the error messages
        # below report the last path that was examined.
        for path in paths:
            # TODO: can we do glob with s3?
            if path.startswith(('s3://', 'gs://')):
                filenames.append(path)
            else:
                # sort to get predictable behaviour (useful for testing)
                filenames.extend(sorted(glob.glob(path)))
        ds = None
        if not filenames:
            raise IOError('Could not open file: {}, it does not exist'.format(path))
        filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
        if len(filenames) == 1:
            path = filenames[0]
            # Strip a url query string before looking at the extension.
            naked_path = path
            if '?' in naked_path:
                naked_path = naked_path[:naked_path.index('?')]
            ext = os.path.splitext(naked_path)[1]
            # NOTE(review): a string ``convert`` is only used as a flag in this
            # single-file branch; the output name is always ``filename_hdf5``.
            if os.path.exists(filename_hdf5) and convert:  # also check mtime?
                # A converted file already exists: reuse it.
                ds = vaex.file.open(filename_hdf5)
            else:
                if ext == '.csv' or naked_path.endswith(".csv.bz2"):
                    # special support for csv.. should probably approach it a different way
                    csv_convert = filename_hdf5 if convert else False
                    ds = from_csv(path, copy_index=copy_index, convert=csv_convert, **kwargs)
                else:
                    ds = vaex.file.open(path, *args, **kwargs)
                if convert and ds:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    ds = vaex.file.open(filename_hdf5)  # argument were meant for pandas?
            if ds is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        else:
            # Several files: ``convert`` may name the combined output file.
            if convert not in [True, False]:
                filename_hdf5 = convert
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                ds = open(filename_hdf5)
            else:
                # with ProcessPoolExecutor() as executor:
                # executor.submit(read_csv_and_convert, filenames, shuffle=shuffle, **kwargs)
                dfs = [open(filename, convert=bool(convert), shuffle=shuffle, **kwargs)
                       for filename in filenames]
                ds = concat(dfs)
                if convert:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    ds = vaex.file.open(filename_hdf5)

        if ds is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return ds
    except Exception:
        # ``(path,)`` keeps the formatting valid when ``path`` is itself a tuple.
        logging.getLogger("vaex").error("error opening %r" % (path,))
        raise
def open(path, convert=False, progress=None, shuffle=False, fs_options=None, fs=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: Uses `dataframe.export` when convert is a path. If True, ``convert=path+'.hdf5'``
                    The conversion is skipped if the input file or conversion argument did not change.
    :param progress: (_Only applies when convert is not False_) {progress}
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: Extra arguments passed to an optional file system if needed (defaults to an empty dict):
        * Amazon AWS S3
            * `anonymous` - access file without authentication (public files)
            * `access_key` - AWS access key, if not provided will use the standard env vars, or the `~/.aws/credentials` file
            * `secret_key` - AWS secret key, similar to `access_key`
            * `profile` - If multiple profiles are present in `~/.aws/credentials`, pick this one instead of 'default', see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            * `region` - AWS Region, e.g. 'us-east-1', will be determined automatically if not provided.
            * `endpoint_override` - URL/ip to connect to, instead of AWS, e.g. 'localhost:9000' for minio
        * Google Cloud Storage
            * :py:class:`gcsfs.core.GCSFileSystem`
        In addition you can pass the boolean "cache" option.
    :param group: (optional) Specify the group to be read from an HDF5 file. By default this is set to "/table".
    :param fs: Apache Arrow FileSystem object, or FSSpec FileSystem object, if specified, fs_options should be empty.
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: the opened DataFrame
    :rtype: DataFrame
    :raises IOError: when the file pattern matches nothing or the file could not be opened

    Cloud storage support:

    Vaex supports streaming of HDF5 files from Amazon AWS S3 and Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/(s3|gs) such that successive access
    is as fast as native disk access.

    The following common fs_options are used for S3 access:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)

    All fs_options can also be encoded in the file path as a query string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', fs_options={{'anonymous': True}})
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5', fs_options={{'access_key': my_key, 'secret_key': my_secret_key}})
    >>> df = vaex.open(f's3://mybucket/path/to/file.hdf5?access_key={{my_key}}&secret_key={{my_secret_key}}')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myproject')

    Google Cloud Storage support:

    The following fs_options are used for GCP access:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5', fs_options={{'token': None}})
    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    # A fresh dict per call: a shared mutable default would leak state between
    # calls if anything downstream modified it.
    fs_options = {} if fs_options is None else fs_options
    try:
        if not isinstance(path, (list, tuple)):
            # remote and clusters only support single path, not a list
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            if path.startswith(("http://", "ws://", "vaex+wss://", "wss://", "vaex+http://", "vaex+ws://")):
                # Everything before the last '/' is the server, the remainder the dataset name.
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                # Only the first value of each query parameter is used.
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        paths = [path] if isinstance(path, str) else path
        filenames = []
        # NOTE: the loop deliberately rebinds ``path``; the error messages
        # below report the last path that was examined.
        for path in paths:
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            naked_path, options = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                # sorted for a predictable file order
                filenames.extend(sorted(vaex.file.glob(path, fs_options=fs_options, fs=fs)))
            else:
                filenames.append(path)
        df = None
        if not filenames:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
            if convert:
                # A string ``convert`` names the output file, otherwise use the derived name.
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(
                    path_input=path, fs_options_input=fs_options, fs_input=fs,
                    path_output=path_output, fs_options_output=fs_options, fs_output=fs,
                    progress=progress,
                    *args, **kwargs
                )
                ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs, **kwargs)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        else:
            # Several files: ``convert`` may name the combined output file.
            if convert not in [True, False]:
                filename_hdf5 = convert
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                dfs = [
                    vaex.open(filename, fs_options=fs_options, fs=fs, convert=bool(convert), shuffle=shuffle, **kwargs)
                    for filename in filenames
                ]
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5, progress=progress)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except Exception:
        # ``(path,)`` keeps the formatting valid when ``path`` is itself a
        # tuple (e.g. an empty tuple of paths), so the real error is not masked.
        logger.exception("error opening %r" % (path,))
        raise
def open(path, convert=False, shuffle=False, fs_options=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: options forwarded as ``fs_options`` to the csv reader, the converter
        and the dataset opener (defaults to an empty dict)
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: the opened DataFrame
    :rtype: DataFrame
    :raises IOError: when the file pattern matches nothing or the file could not be opened

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myprofile')

    GCS support:

    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    # A fresh dict per call: a shared mutable default would leak state between
    # calls if anything downstream modified it.
    fs_options = {} if fs_options is None else fs_options
    try:
        # Aliases, remote servers and clusters only apply to a single path;
        # a list/tuple of paths goes straight to the file handling below.
        if not isinstance(path, (list, tuple)):
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            if path.startswith(("http://", "ws://", "vaex+http://", "vaex+ws://")):  # TODO: think about https and wss
                # Everything before the last '/' is the server, the remainder the dataset name.
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                # Only the first value of each query parameter is used.
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        paths = [path] if isinstance(path, str) else path
        filenames = []
        # NOTE: the loop deliberately rebinds ``path``; the error messages
        # below report the last path that was examined.
        for path in paths:
            # Normalise each entry the same way a single path is normalised above.
            path = vaex.file.stringyfy(path)
            naked_path, options = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                # NOTE(review): glob receives **kwargs here, not fs_options - confirm this is intended.
                filenames.extend(sorted(vaex.file.glob(path, **kwargs)))
            else:
                filenames.append(path)
        df = None
        if not filenames:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, convert=convert, **kwargs)
            if convert:
                # A string ``convert`` names the output file, otherwise use the derived name.
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(path_input=path, fs_options_input=fs_options,
                                     path_output=path_output, fs_options_output=fs_options,
                                     *args, **kwargs)
                ds = vaex.dataset.open(path_output, fs_options=fs_options)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        else:
            # Several files: ``convert`` may name the combined output file.
            if convert not in [True, False]:
                filename_hdf5 = convert
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                # Forward fs_options so each file is opened with the same
                # file system options as in the single-file branch.
                dfs = [
                    vaex.open(filename, fs_options=fs_options, convert=bool(convert), shuffle=shuffle, **kwargs)
                    for filename in filenames
                ]
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except Exception:
        # ``(path,)`` keeps the formatting valid when ``path`` is itself a tuple.
        logging.getLogger("vaex").error("error opening %r" % (path,))
        raise
def tornado_client(webserver, event_loop):
    """Yield a vaex client connected to ``webserver.port`` on localhost.

    The client is closed when the generator is resumed after the ``yield``.
    """
    address = "%s://localhost:%d" % (scheme, webserver.port)
    connection = vaex.connect(address)
    yield connection
    connection.close()
def server(vaex_server):
    """Yield a vaex connection to localhost on ``test_port``.

    The connection is closed when the generator is resumed after the ``yield``.
    """
    address = "%s://localhost:%d" % (scheme, test_port)
    connection = vaex.connect(address)
    yield connection
    connection.close()