def test_concat_functions():
    def foo(a, b):
        return a + b

    df1 = vaex.from_scalars(x=1, y=2)
    df2 = vaex.from_scalars(x=2, y=3)
    df1.add_function('foo', foo)
    df2.add_function('foo', foo)
    # w has same expression and function
    df1['w'] = df1.func.foo(df1.x, df1.y)
    df2['w'] = df2.func.foo(df2.x, df2.y)
    assert df1.w.tolist() == [3]

    df = vaex.concat([df1, df2])
    assert df.w.tolist() == [1 + 2, 2 + 3]

    # now bar is a new function
    def bar1(a, b):
        return a + b

    def bar2(a, b):
        return a + b

    df1 = vaex.from_scalars(x=1, y=2)
    df2 = vaex.from_scalars(x=2, y=3)
    df1.add_function('bar', bar1)
    df2.add_function('bar', bar2)
    with pytest.raises(ValueError):
        df = vaex.concat([df1, df2])

def test_concat():
    x1, y1, z1 = np.arange(3), np.arange(3, 0, -1), np.arange(10, 13)
    x2, y2, z2 = np.arange(3, 6), np.arange(0, -3, -1), np.arange(13, 16)
    x3, y3, z3 = np.arange(6, 9), np.arange(-3, -6, -1), np.arange(16, 19)
    w1, w2, w3 = np.array(['cat']*3), np.array(['dog']*3), np.array(['fish']*3)
    x = np.concatenate((x1, x2, x3))
    y = np.concatenate((y1, y2, y3))
    z = np.concatenate((z1, z2, z3))
    w = np.concatenate((w1, w2, w3))
    ds = vaex.from_arrays(x=x, y=y, z=z, w=w)

    ds1 = vaex.from_arrays(x=x1, y=y1, z=z1, w=w1)
    ds2 = vaex.from_arrays(x=x2, y=y2, z=z2, w=w2)
    ds3 = vaex.from_arrays(x=x3, y=y3, z=z3, w=w3)

    dd = vaex.concat([ds1, ds2])
    ww = ds1.concat(ds2)

    # Test whether concatenating two dataframes with the vaex module function gives the same result as the dataframe method
    assert (np.array(dd.evaluate('x,y,z,w'.split(','))) == np.array(ww.evaluate('x,y,z,w'.split(',')))).all()

    # Test whether concatenation of multiple datasets works
    dd = vaex.concat([ds1, ds2, ds3])
    assert (np.array(dd.evaluate('x')) == np.array(ds.evaluate('x'))).all()
    assert (np.array(dd.evaluate('y')) == np.array(ds.evaluate('y'))).all()
    assert (np.array(dd.evaluate('z')) == np.array(ds.evaluate('z'))).all()
    assert (np.array(dd.evaluate('w')) == np.array(ds.evaluate('w'))).all()

    # Test whether concatenation of already concatenated datasets works
    dd1 = vaex.concat([ds1, ds2])
    dd2 = vaex.concat([dd1, ds3])
    assert (np.array(dd2.evaluate('x')) == np.array(ds.evaluate('x'))).all()
    assert (np.array(dd2.evaluate('y')) == np.array(ds.evaluate('y'))).all()
    assert (np.array(dd2.evaluate('z')) == np.array(ds.evaluate('z'))).all()
    assert (np.array(dd2.evaluate('w')) == np.array(ds.evaluate('w'))).all()

def _from_csv_convert_and_read(filename_or_buffer, path_output, chunk_size, fs_options, fs=None, copy_index=False, progress=None, **kwargs):
    # figure out the CSV file path
    csv_path = vaex.file.stringyfy(filename_or_buffer)
    path_output_bare, ext, _ = vaex.file.split_ext(path_output)

    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    # we don't have indeterminate progress bars, so we cast it to truthy
    progress = bool(progress) if progress is not None else False
    if progress:
        print("Converting csv to chunk files")
    with vaex.file.open(filename_or_buffer, fs_options=fs_options, fs=fs, for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer, chunksize=chunk_size, **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            chunk_name = f'{path_output_bare}_chunk_{i}{ext}'
            df.export(chunk_name)
            converted_paths.append(chunk_name)
            log.info('saved chunk #%d to %s' % (i, chunk_name))
            if progress:
                print("Saved chunk #%d to %s" % (i, chunk_name))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], path_output)
    else:
        if progress:
            print('Converting %d chunks into single file %s' % (len(converted_paths), path_output))
        log.info('converting %d chunks into single file %s' % (len(converted_paths), path_output))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export(path_output, progress=progress)

        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error('Could not close or delete intermediate file %s used to convert %s to single file %s: %s',
                          df_path, csv_path, path_output, e)

def df_concat_cache(ds_trimmed_cache):
    df = ds_trimmed_cache
    df1 = df[:2]    # length 2
    df2 = df[2:3]   # length 1
    df3 = df[3:7]   # length 4
    df4 = df[7:10]  # length 3
    return vaex.concat([df1, df2, df3, df4])

def _from_csv_convert_and_read(filename_or_buffer, maybe_convert_path, chunk_size, fs_options, fs=None, copy_index=False, **kwargs):
    # figure out the CSV file path
    if isinstance(maybe_convert_path, str):
        csv_path = re.sub(r'\.hdf5$', '', str(maybe_convert_path), flags=re.IGNORECASE)
    elif isinstance(filename_or_buffer, str):
        csv_path = filename_or_buffer
    else:
        raise ValueError('Cannot derive filename to use for converted HDF5 file, '
                         'please specify it using convert="my.csv.hdf5"')
    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    with vaex.file.open(filename_or_buffer, fs_options=fs_options, fs=fs, for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer, chunksize=chunk_size, **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            filename_hdf5 = _convert_name(csv_path, suffix='_chunk%d' % i)
            df.export_hdf5(filename_hdf5)
            converted_paths.append(filename_hdf5)
            log.info('saved chunk #%d to %s' % (i, filename_hdf5))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], combined_hdf5)
    else:
        log.info('converting %d chunks into single HDF5 file %s' % (len(converted_paths), combined_hdf5))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export_hdf5(combined_hdf5)

        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error('Could not close or delete intermediate hdf5 file %s used to convert %s to hdf5: %s' % (df_path, csv_path, e))

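# A minimal usage sketch of the chunked conversion path implemented by the helpers above,
# assuming a local file named 'big.csv' (a hypothetical name, not referenced by the code above).
# With convert=True and a chunk_size, vaex.from_csv reads the CSV in chunks, exports each chunk,
# and concatenates the chunk files into one converted file that is reopened on later calls.
def _example_from_csv_convert():
    import vaex
    # 'big.csv' is hypothetical; chunk_size controls how many rows are read per chunk
    df = vaex.from_csv('big.csv', convert=True, chunk_size=5_000_000, copy_index=False)
    return df
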
def test_concat_missing_values():
    df1 = vaex.from_arrays(x=[1, 2, 3], y=[np.nan, 'b', 'c'])
    df2 = vaex.from_arrays(x=[4, 5, np.nan], y=['d', 'e', 'f'])
    df = vaex.concat([df1, df2])
    repr(df.head(4))
    repr(df.tail(4))
    assert len(df) == 6

def test_concat_keep_virtual():
    df1 = vaex.from_scalars(x=1, y=2)
    df2 = vaex.from_scalars(x=2, y=3)
    # w has same expression
    df1['w'] = df1.x + df1.y
    df2['w'] = df2.x + df2.y
    df = vaex.concat([df1, df2])
    assert 'w' in df.virtual_columns
    assert 'w' not in df.get_column_names(virtual=False)
    assert 'w' not in df.dataset

def _iris(name, iris_previous, N):
    filename = os.path.join(vaex.utils.get_private_dir('data'), name + '.hdf5')
    if os.path.exists(filename):
        return vaex.open(filename)
    else:
        iris = iris_previous()
        repeat = int(np.ceil(N / len(iris)))
        ds = vaex.concat([iris] * repeat)
        ds.export_hdf5(filename)
        return vaex.open(filename)

def test_concat_unequals_virtual_columns():
    ds1 = vaex.from_scalars(x=1, y=2)
    ds2 = vaex.from_scalars(x=2, y=3)
    # w has same expression
    ds1['w'] = ds1.x + ds1.y
    ds2['w'] = ds2.x + ds2.y
    # z does not
    ds1['z'] = ds1.x + ds1.y
    ds2['z'] = ds2.x * ds2.y
    ds = vaex.concat([ds1, ds2])
    assert ds.w.tolist() == [1 + 2, 2 + 3]
    assert ds.z.tolist() == [1 + 2, 2 * 3]

def test_concat_unequals_virtual_columns():
    df1 = vaex.from_scalars(x=1, y=2)
    df2 = vaex.from_scalars(x=2, y=3)
    # w has same expression
    df1['w'] = df1.x + df1.y
    df2['w'] = df2.x + df2.y
    # z does not
    df1['z'] = df1.x + df1.y
    df2['z'] = df2.x * df2.y
    df = vaex.concat([df1, df2])
    assert df.w.tolist() == [1 + 2, 2 + 3]
    assert df.z.tolist() == [1 + 2, 2 * 3]

def test_concat_timestamp():
    df1 = pa.Table.from_arrays([pa.array(['2020-01-31', '2020-01-31']).cast('timestamp[us]')], names=['ts'])
    df2 = pa.Table.from_arrays([pa.array(['2020-12-31', '2020-12-31']).cast('timestamp[ns]')], names=['ts'])
    df1_vx = vaex.from_arrow_table(df1)
    df2_vx = vaex.from_arrow_table(df2)
    df = vaex.concat([df1_vx, df2_vx])
    assert df.ts.tolist() == df1['ts'].to_pylist() + df2['ts'].to_pylist()
    assert df.ts.dtype.internal == pa.timestamp('ns')

def test_concat_mixed_types():
    x1 = np.zeros(3) + np.nan
    x2 = vaex.string_column(['hi', 'there'])
    df1 = vaex.from_arrays(x=x1)
    df2 = vaex.from_arrays(x=x2)
    df = vaex.concat([df1, df2])
    assert df2.x.dtype == df.x.dtype, "expect 'upcast' to string"
    assert df[:2].x.tolist() == ['nan', 'nan']
    assert df[1:4].x.tolist() == ['nan', 'nan', 'hi']
    assert df[2:4].x.tolist() == ['nan', 'hi']
    assert df[3:4].x.tolist() == ['hi']
    assert df[3:5].x.tolist() == ['hi', 'there']

def test_concat(df_file, tmpdir):
    path = tmpdir / 'test2.hdf5'
    df_file[['x']].export(path)
    df_concat = vaex.open(path)
    df = vaex.concat([df_file, df_concat])
    assert len(pickle.dumps(df)) < 2000
    df2 = pickle.loads(pickle.dumps(df))
    assert len(df) == len(df_file) * 2
    assert len(df2) == len(df_file) * 2
    # assert df.compare(df2) == ([], [], [], [])
    assert df2.x.count() == len(df_file) * 2, 'x is repeated'
    assert df2.x.sum() == df_file.x.sum() * 2, 'x is repeated'
    assert df2.y.sum() == df_file.y.sum(), 'y is not repeated'

def test_from_csv():
    # can read with default options
    df = vaex.from_csv(csv_path, copy_index=True)
    _assert_csv_content(df, with_index=True)

    # can read an empty CSV
    df = vaex.from_csv(os.path.join(path, 'data', 'empty.csv'))
    assert len(df) == 0

    # can read as chunks iterator
    df_iterator = vaex.from_csv(csv_path, chunk_size=1)
    df1 = next(df_iterator)
    assert len(df1) == 1
    df2, df3 = next(df_iterator), next(df_iterator)
    with pytest.raises(StopIteration):
        next(df_iterator)
    _assert_csv_content(vaex.concat([df1, df2, df3]))

def _from_dataframe_to_vaex(df: DataFrameObject) -> vaex.dataframe.DataFrame:
    """
    Note: we need to implement/test support for bit/byte masks, chunk handling, etc.
    """
    # Iterate through the chunks
    dataframe = []
    _buffers = []
    for chunk in df.get_chunks():
        # We need a dict of columns here, with each column being an expression.
        columns = dict()
        _k = _DtypeKind
        _buffers_chunks = []  # hold on to buffers, keeps memory alive
        for name in chunk.column_names():
            if not isinstance(name, str):
                raise ValueError(f"Column {name} is not a string")
            if name in columns:
                raise ValueError(f"Column {name} is not unique")

            col = chunk.get_column_by_name(name)
            if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
                # Simple numerical or bool dtype, turn into arrow array
                columns[name], _buf = convert_column_to_ndarray(col)
            elif col.dtype[0] == _k.CATEGORICAL:
                columns[name], _buf = convert_categorical_column(col)
            elif col.dtype[0] == _k.STRING:
                columns[name], _buf = convert_string_column(col)
            else:
                raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")

            _buffers_chunks.append(_buf)

        dataframe.append(vaex.from_dict(columns))
        # chunk buffers are added to list of all buffers
        _buffers.append(_buffers_chunks)

    if df.num_chunks() == 1:
        _buffers = _buffers[0]

    df_new = vaex.concat(dataframe)
    df_new._buffers = _buffers
    return df_new

def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert df.get_column_names() == ['X!1', 'class']
    assert df.get_column_names(alias=False) != ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert 'X!1' in df._column_aliases
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()

    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())

def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert str(df['X!1']) != 'X!1', "invalid identifier cannot be an expression"
    assert str(df['class']) != 'class', "keyword cannot be an expression"
    assert df.get_column_names() == ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()

    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())

def predict(self, instances, **kwargs):
    if isinstance(instances[0], list):
        data = np.asarray(instances).T
        df = vaex.from_arrays(Arrival_Time=data[0], Creation_Time=data[1],
                              x=data[2], y=data[3], z=data[4])
    elif isinstance(instances[0], dict):
        dfs = []
        for instance in instances:
            df = vaex.from_dict(instance)
            dfs.append(df)
        df = vaex.concat(dfs)
    else:
        return ['invalid input format']

    df.state_set(self.state, set_filter=False)
    return df.pred_name.tolist()

def predict(data: Data):
    instances = data.instances
    if isinstance(instances[0], list):
        data = np.asarray(instances).T
        df = vaex.from_arrays(Arrival_Time=data[0], Creation_Time=data[1],
                              x=data[2], y=data[3], z=data[4])
    elif isinstance(instances[0], dict):
        dfs = []
        for instance in instances:
            df = vaex.from_dict(instance)
            dfs.append(df)
        df = vaex.concat(dfs)
    else:
        return {'predictions': 'invalid input format'}

    df.state_set(global_items['state'], set_filter=False)
    return {'predictions': df.pred_name.tolist()}

def test_concat_strict(df_factory):
    df1 = df_factory(x=[1, 2])
    df2 = df_factory(x=[3, None, 4])
    df = vaex.concat([df1, df2], resolver='strict')
    assert df.x.tolist() == [1, 2, 3, None, 4]

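# A companion sketch to test_concat_strict above. This assumes vaex.concat's default resolver
# is 'flexible' (allowing dataframes with differing column sets to be concatenated), while
# resolver='strict' rejects mismatching schemas; both the column names and the exact flexible
# behaviour are illustrative assumptions, not taken from the test above.
def _example_concat_resolvers():
    import vaex
    df1 = vaex.from_arrays(x=[1, 2])
    df2 = vaex.from_arrays(x=[3, 4], y=[5, 6])
    # assumed default resolver: tolerant of the extra column 'y'
    df_flexible = vaex.concat([df1, df2], resolver='flexible')
    # strict resolver: mismatching column sets are expected to raise
    try:
        vaex.concat([df1, df2], resolver='strict')
    except Exception as e:
        print('strict resolver rejected mismatching columns:', e)
    return df_flexible
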
def test_concat_arrow_strings():
    df1 = vaex.from_arrays(x=vaex.string_column(['aap', 'noot', 'mies']))
    df2 = vaex.from_arrays(x=vaex.string_column(['a', 'b', 'c']))
    df = vaex.concat([df1, df2])
    assert df.data_type('x') == df1.data_type('x')
    assert df.x.tolist() == ['aap', 'noot', 'mies', 'a', 'b', 'c']

def concat(*types):
    dfs = [vaex.from_arrays(x=np.arange(3, dtype=dtype)) for dtype in types]
    dataset_concat = vaex.concat(dfs)
    return dataset_concat

def test_concat_chunk_iterator(l1, l2):
    i1 = 0
    i2 = i1 + l1
    i3 = i2 + l2
    x = np.arange(10)
    y = x**2
    g = x // 3
    ds = vaex.dataset.DatasetArrays(x=x, y=y, g=g)
    df_original = df = vaex.from_dataset(ds)
    df1 = df[i1:i2]
    df2 = df[i2:i3]
    df3 = df[i3:]
    df = vaex.concat([df1, df2, df3])
    ds_full = ds = df.dataset

    # very similar to the arrow/dataset_test.py parquet test
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        chunks['x'].tolist() == x[i1:i2].tolist()
        chunks['y'].tolist() == y[i1:i2].tolist()

    # no columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    ds = ds[1:10]
    assert 'x' in ds
    assert ds.row_count == 9
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        if i == 4:
            assert i1 == 8
            assert i2 == 9
        else:
            assert i1 == i * 2
            assert i2 == (i + 1) * 2
        chunks['x'].tolist() == x[i1:i2].tolist()
        chunks['y'].tolist() == y[i1:i2].tolist()

    ds = ds[1:9]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        chunks['x'].tolist() == x[i1:i2].tolist()
        chunks['y'].tolist() == y[i1:i2].tolist()

    # no columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    # again, but here we skip a total of a chunk_size at the end
    ds = ds_full[:8]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        chunks['x'].tolist() == x[i1:i2].tolist()
        chunks['y'].tolist() == y[i1:i2].tolist()

    for i in range(9):
        for j in range(i + 1, 10):
            ds = ds_full.slice(i, j)
            values = []
            for i1, i2, chunks in ds.chunk_iterator(['x']):
                values.extend(chunks['x'].tolist())
            assert x[i:j].tolist() == values

    assert df.x.tolist() == x.tolist()
    assert df.g.tolist() == g.tolist()

    ds_dropped = ds.dropped('x')
    assert 'x' not in ds_dropped

def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: Uses `dataframe.export` when convert is a path. If True, ``convert=path+'.hdf5'``
        The conversion is skipped if the input file or conversion argument did not change.
    :param progress: (_Only applies when convert is not False_) {progress}
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: Extra arguments passed to an optional file system if needed:
        * Amazon AWS S3
            * `anonymous` - access file without authentication (public files)
            * `access_key` - AWS access key, if not provided will use the standard env vars, or the `~/.aws/credentials` file
            * `secret_key` - AWS secret key, similar to `access_key`
            * `profile` - If multiple profiles are present in `~/.aws/credentials`, pick this one instead of 'default', see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            * `region` - AWS Region, e.g. 'us-east-1', will be determined automatically if not provided.
            * `endpoint_override` - URL/ip to connect to, instead of AWS, e.g. 'localhost:9000' for minio
        * Google Cloud Storage
            * :py:class:`gcsfs.core.GCSFileSystem`
        In addition you can pass the boolean "cache" option.
    :param group: (optional) Specify the group to be read from an HDF5 file. By default this is set to "/table".
    :param fs: Apache Arrow FileSystem object, or FSSpec FileSystem object, if specified, fs_options should be empty.
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    Cloud storage support:

    Vaex supports streaming of HDF5 files from Amazon AWS S3 and Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/(s3|gs) such that successive access
    is as fast as native disk access.

    The following common fs_options are used for S3 access:

    * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
    * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)

    All fs_options can also be encoded in the file path as a query string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', fs_options={{'anonymous': True}})
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5', fs_options={{'access_key': my_key, 'secret_key': my_secret_key}})
    >>> df = vaex.open(f's3://mybucket/path/to/file.hdf5?access_key={{my_key}}&secret_key={{my_secret_key}}')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myproject')

    Google Cloud Storage support:

    The following fs_options are used for GCP access:

    * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
    * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
    * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5', fs_options={{'token': None}})
    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        if not isinstance(path, (list, tuple)):
            # remote and clusters only support single path, not a list
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            if path.startswith("http://") or path.startswith("ws://") or \
               path.startswith("vaex+wss://") or path.startswith("wss://") or \
               path.startswith("vaex+http://") or path.startswith("vaex+ws://"):
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        filenames = []
        for path in paths:
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            naked_path, options = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                filenames.extend(list(sorted(vaex.file.glob(path, fs_options=fs_options, fs=fs))))
            else:
                filenames.append(path)
        df = None
        if len(filenames) == 0:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        filename_hdf5_noshuffle = vaex.convert._convert_name(filenames, shuffle=False)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
            if convert:
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(
                    path_input=path, fs_options_input=fs_options, fs_input=fs,
                    path_output=path_output, fs_options_output=fs_options, fs_output=fs,
                    progress=progress,
                    *args, **kwargs
                )
                ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs, **kwargs)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        elif len(filenames) > 1:
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                dfs = []
                for filename in filenames:
                    dfs.append(vaex.open(filename, fs_options=fs_options, fs=fs, convert=bool(convert), shuffle=shuffle, **kwargs))
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5, progress=progress)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logger.exception("error opening %r" % path)
        raise

def open(path, convert=False, shuffle=False, fs_options={}, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

    * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
    * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
    * profile and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myprofile')

    GCS support:

    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:

    * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
    * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
    * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        path = vaex.file.stringyfy(path)
        if path in aliases:
            path = aliases[path]
        path = vaex.file.stringyfy(path)
        if path.startswith("http://") or path.startswith("ws://") or \
           path.startswith("vaex+http://") or path.startswith("vaex+ws://"):  # TODO: think about https and wss
            server, name = path.rsplit("/", 1)
            url = urlparse(path)
            if '?' in name:
                name = name[:name.index('?')]
            extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
            if 'token' in extra_args:
                kwargs['token'] = extra_args['token']
            if 'token_trusted' in extra_args:
                kwargs['token_trusted'] = extra_args['token_trusted']
            client = vaex.connect(server, **kwargs)
            return client[name]
        if path.startswith("cluster"):
            import vaex.enterprise.distributed
            return vaex.enterprise.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            if isinstance(path, str):
                paths = [path]
            else:
                paths = path
            filenames = []
            for path in paths:
                naked_path, options = vaex.file.split_options(path)
                if glob.has_magic(naked_path):
                    filenames.extend(list(sorted(vaex.file.glob(path, **kwargs))))
                else:
                    filenames.append(path)
            df = None
            if len(filenames) == 0:
                raise IOError(f'File pattern did not match anything {path}')
            filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            filename_hdf5_noshuffle = vaex.convert._convert_name(filenames, shuffle=False)
            if len(filenames) == 1:
                path = filenames[0]
                # # naked_path, _ = vaex.file.split_options(path, fs_options)
                _, ext, _ = vaex.file.split_ext(path)
                if ext == '.csv':  # special case for csv
                    return vaex.from_csv(path, fs_options=fs_options, convert=convert, **kwargs)
                if convert:
                    path_output = convert if isinstance(convert, str) else filename_hdf5
                    vaex.convert.convert(
                        path_input=path, fs_options_input=fs_options,
                        path_output=path_output, fs_options_output=fs_options,
                        *args, **kwargs
                    )
                    ds = vaex.dataset.open(path_output, fs_options=fs_options)
                else:
                    ds = vaex.dataset.open(path, fs_options=fs_options)
                df = vaex.from_dataset(ds)
                if df is None:
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
            elif len(filenames) > 1:
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
                if os.path.exists(filename_hdf5) and convert:  # also check mtime
                    df = vaex.open(filename_hdf5)
                else:
                    dfs = []
                    for filename in filenames:
                        dfs.append(vaex.open(filename, convert=bool(convert), shuffle=shuffle, **kwargs))
                    df = vaex.concat(dfs)
                    if convert:
                        if shuffle:
                            df = df.shuffle()
                        df.export_hdf5(filename_hdf5)
                        df = vaex.open(filename_hdf5)
            if df is None:
                raise IOError('Unknown error opening: {}'.format(path))
            return df
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise

def compute_flow_data(days, hours, zone):
    logger.info("Compute: flow data: days=%r hours=%r zone=%r", days, hours, zone)
    df, selection = create_selection(days, hours)
    df.select(df.pickup_zone == zone, mode='and')
    selection = True
    df_flow_zone = df.groupby([df.pickup_zone, df.dropoff_zone],
                              agg={'count_trips': vaex.agg.count(selection=selection)})
    # sort descending so we can take the top N
    df_flow_zone = df_flow_zone.sort('count_trips', ascending=False)
    df_flow_zone['pickup_borough'] = df_flow_zone.pickup_zone.map(zone_index_to_borough_index)
    df_flow_zone['dropoff_borough'] = df_flow_zone.dropoff_zone.map(zone_index_to_borough_index)

    pickup_zone = zone
    pickup_borough = zone_index_to_borough_index[pickup_zone]

    # Now to include the total count of all trips for zones that are not in the top N:
    # only trips leaving from this zone and to a different borough
    df_outflow_zone = df_flow_zone[(df_flow_zone.pickup_zone == pickup_zone)]
    df_outflow_zone = df_outflow_zone[df_outflow_zone.dropoff_borough != pickup_borough]

    df_outflows_top = []
    df_outflows_rest = []
    for dropoff_borough in range(6):
        if dropoff_borough == pickup_borough:
            continue
        # outflow from this zone, to a particular borough
        df_outflow_zone_borough = df_outflow_zone[df_outflow_zone.dropoff_borough == dropoff_borough]
        if len(df_outflow_zone_borough):
            n_max = min(len(df_outflow_zone_borough), n_largest)
            # top N zones of outflow from this zone, to a particular borough
            df_outflows_top.append(df_outflow_zone_borough[:n_max])
            if len(df_outflow_zone_borough) > n_largest:
                count_other = df_outflow_zone_borough[n_largest:]['count_trips'].sum()
                # rest of the outflow from this zone, to a particular borough
                df_outflows_rest.append(vaex.from_scalars(pickup_borough=pickup_borough,
                                                          dropoff_borough=dropoff_borough,
                                                          dropoff_zone=len(zone_index_to_name) + dropoff_borough,
                                                          count_trips=count_other))

    df_outflow_top = vaex.concat(df_outflows_top)
    df_outflow_borough = df_outflow_zone.groupby(['pickup_borough', 'dropoff_borough'],
                                                 agg={'count_trips': vaex.agg.sum('count_trips')})

    if df_outflows_rest:
        df_outflow_rest = vaex.concat(df_outflows_rest)
    else:
        # create an empty dataframe with the same schema to make the rest of the code simpler
        df_outflow_rest = vaex.from_scalars(pickup_borough=-1, dropoff_borough=-1, dropoff_zone=-1, count_trips=-1)[:0]

    # return as dict and lists so it can be serialized by the memoize decorator
    flow_data = dict(
        outflow_top=df_outflow_top.to_dict(array_type='list'),
        outflow_rest=df_outflow_rest.to_dict(array_type='list'),
        outflow_borough=df_outflow_borough.to_dict(array_type='list'))
    return flow_data
