def combine_csv():
    """Merge every CSV inside the GTFS zip archive and convert the result to HDF5.

    Reads each member of the module-level ``gtfs_csv_zip`` archive as a
    header-less CSV, concatenates them, labels the columns with
    ``entity_cols``, drops rows that are duplicates on every column except
    the last one, and writes the result to ``gtfs_final_csv_path``. Any
    existing file at ``gtfs_final_hdf5_path`` is removed before the CSV is
    converted with ``vaex.from_csv(..., convert=True)``.

    Raises:
        ValueError: propagated from ``pd.concat`` when the archive holds no
            readable members.
    """
    frames = []
    # read from the gtfs records
    with zipfile.ZipFile(gtfs_csv_zip, "r") as archive:
        for info in archive.infolist():
            # Directory entries carry no data and would make read_csv fail.
            if info.is_dir():
                continue
            # Close each member handle as soon as pandas has consumed it.
            with archive.open(info) as member:
                frames.append(pd.read_csv(member, header=None))

    # merge all the csv's in the zip file
    combined_csv = pd.concat(frames)
    combined_csv.columns = entity_cols

    # dropping duplicates (the last column is ignored for the comparison)
    combined_csv = combined_csv.drop_duplicates(subset=entity_cols[:-1])

    # convert to csv
    combined_csv.to_csv(gtfs_final_csv_path, index=False, header=True)
    print(f"finished combining the zip files, time: {duration()}")

    # Remove a stale HDF5 file first.
    # NOTE(review): presumably gtfs_final_hdf5_path is where the conversion
    # below writes its output - confirm against the path constants.
    if os.path.exists(gtfs_final_hdf5_path):
        os.remove(gtfs_final_hdf5_path)
    vaex.from_csv(gtfs_final_csv_path,
                  convert=True,
                  copy_index=False,
                  chunk_size=1000000)
    print(f"finished converting to hdf5, time: {duration()}")
def read_states(statesf):
    """Exomol IO for a state file.

    The file is first read as a bz2-compressed, whitespace-separated table;
    if that attempt fails for any reason it is read again without the
    compression hint.

    Args:
        statesf: state file

    Returns:
        states data in vaex DataFrame

    Note:
        i=state counting number
        E=state energy
        g=state degeneracy
        J=total angular momentum

        See Table 11 in https://arxiv.org/pdf/1603.05890.pdf
    """
    columns = ('i', 'E', 'g', 'J')
    try:
        dat = vaex.from_csv(statesf,
                            compression='bz2',
                            sep=r'\s+',
                            usecols=range(4),
                            names=columns,
                            convert=True)
    except Exception:
        # Fall back to reading without the bz2 hint. ``Exception`` rather
        # than a bare ``except`` so Ctrl-C / SystemExit are not swallowed.
        dat = vaex.read_csv(statesf,
                            sep=r'\s+',
                            usecols=range(4),
                            names=columns,
                            convert=True)
    return dat
def csv_to_df(file: str):
    """Copy a csv file into ``outDir`` and open it as an hdf5-backed Vaex dataframe.

    The conversion chunk size is derived from the number of comma-separated
    fields on the first line of the copied file, so that one chunk holds
    roughly 2**24 cells (and never fewer than one row).

    Args:
        file (str): Path to input file.

    Returns:
        Vaex dataframe
    """
    logger.info("csv_to_df: Copy csv file into outDir for processing...")
    staged_csv = os.path.join(outDir, f"{Path(file).stem}.csv")
    shutil.copyfile(file, staged_csv)

    logger.info("csv_to_df: Checking size of csv file...")
    # Only the first line is needed to learn the column count.
    with open(staged_csv, "r", encoding="utf-8") as handle:
        header_fields = handle.readline().split(",")
    column_count = len(header_fields)
    rows_per_chunk = max(2**24 // column_count, 1)
    logger.info(f"csv_to_df: # of columns are: {column_count}")

    logger.info("csv_to_df: converting file into hdf5 format")
    return vaex.from_csv(staged_csv, convert=True, chunk_size=rows_per_chunk)
def extract_file_data(self, file_path, file_name):
    """
    Load a data file into a vaex DataFrame and record its column dtypes.

    The loader is chosen from the text after the last ``.`` in
    ``file_name``: ``csv`` goes through ``vaex.from_csv``; ``hdf5``,
    ``parquet`` and ``s3`` go through ``vaex.open``.

    :param file_path: This is the data location,ex: csv file location
    :param file_name: This is the file name
    :return: tuple ``(df, dtypes)`` where ``dtypes`` is the list of column
        dtypes rendered as strings (also stored on
        ``self.column_datatype_list``)
    :raises ValueError: if the file extension is not one of the supported types
    """
    print('extract_file_data')
    size = round(get_file_size(file_path, SIZE_UNIT.MB), 2)
    print('Size of file is : ', size, 'MB')

    file_type = str(file_name).split('.')[-1]
    if file_type == 'csv':
        df = vaex.from_csv(file_path, copy_index=False)
    elif file_type in ('hdf5', 'parquet', 's3'):
        df = vaex.open(file_path)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``df``; fail early with a clear message.
        raise ValueError(
            'Unsupported file type %r for file %r' % (file_type, file_name))

    self.column_datatype_list = [str(dtype) for dtype in list(df.dtypes)]
    raw_detected_column_datatype = self.column_datatype_list
    return df, raw_detected_column_datatype
def read_trans(transf):
    """Exomol IO for a transition file.

    The file is first read as a bz2-compressed, whitespace-separated table;
    if that attempt fails for any reason it is read again without the
    compression hint.

    Args:
        transf: transition file

    Returns:
        transition data in vaex DataFrame

    Note:
        i_upper=Upper state counting number
        i_lower=Lower state counting number
        A=Einstein coefficient in s-1
        nu_lines=transition wavenumber in cm-1

        See Table 12 in https://arxiv.org/pdf/1603.05890.pdf
    """
    columns = ('i_upper', 'i_lower', 'A', 'nu_lines')
    try:
        dat = vaex.from_csv(transf,
                            compression='bz2',
                            sep=r'\s+',
                            names=columns,
                            convert=True)
    except Exception:
        # Fall back to reading without the bz2 hint. ``Exception`` rather
        # than a bare ``except`` so Ctrl-C / SystemExit are not swallowed.
        dat = vaex.read_csv(transf,
                            sep=r'\s+',
                            names=columns,
                            convert=True)
    return dat
def _load_csvs_into_dfs(self, start_time, end_time):
    """Load the stock data files into a ``{symbol: {kind: dataframe}}`` mapping.

    When HDF5 files are available they are loaded through
    ``self._load_hdf5_into_df`` for the given time window; otherwise every
    CSV file is converted with ``vaex.from_csv(..., convert=True)``. A file
    whose name contains ``'Tick'`` is stored under ``'snapshot'``, one whose
    name contains ``'Transaction'`` under ``'tick'``; other files only
    create the (possibly empty) per-symbol entry.
    """
    if settings.PRINT_EVENTS:
        print("读取CSV文件...")

    loaded = {}

    def _register(symbol, source_file, frame):
        # File the frame under its symbol according to the file-name marker.
        per_symbol = loaded.setdefault(symbol, {})
        if 'Tick' in source_file:
            kind, message = 'snapshot', "加载 '%s' %s snapshot数据..."
        elif 'Transaction' in source_file:
            kind, message = 'tick', "加载 '%s' %s tick数据..."
        else:
            return
        per_symbol[kind] = frame
        if settings.PRINT_EVENTS:
            print(message % (self.code, symbol))

    # Collect the names of already-converted HDF5 files.
    hdf5_files = self._obtain_stock_hdf5_files()
    if len(hdf5_files) > 0:
        for hdf5_file in hdf5_files:
            symbol = self._obtain_stock_symbol_from_filename(hdf5_file)
            frame = self._load_hdf5_into_df(hdf5_file, start_time, end_time)
            _register(symbol, hdf5_file, frame)
    else:
        # No HDF5 files yet: convert the CSV files to HDF5.
        for csv_file in self._obtain_stock_csv_files():
            symbol = self._obtain_stock_symbol_from_filename(csv_file)
            frame = vaex.from_csv(csv_file, convert=True, chunk_size=5000000)
            # Not yet known how to rename the column, so it is copied instead.
            frame['date_time'] = frame['Unnamed: 0']
            _register(symbol, csv_file, frame)
    return loaded
def test_from_csv():
    """Exercise ``vaex.from_csv`` with default options, an empty file and chunked reading."""
    # default options, with the index copied into the DataFrame
    indexed = vaex.from_csv(csv_path, copy_index=True)
    _assert_csv_content(indexed, with_index=True)

    # an empty CSV gives an empty DataFrame
    empty_csv = os.path.join(path, 'data', 'empty.csv')
    assert len(vaex.from_csv(empty_csv)) == 0

    # with chunk_size the result is an iterator of one-row DataFrames
    chunk_iter = vaex.from_csv(csv_path, chunk_size=1)
    first = next(chunk_iter)
    assert len(first) == 1
    second = next(chunk_iter)
    third = next(chunk_iter)
    with pytest.raises(StopIteration):
        next(chunk_iter)
    combined = vaex.dataframe.DataFrameConcatenated([first, second, third])
    _assert_csv_content(combined)
def test_from_csv():
    """Exercise ``vaex.from_csv``: defaults, empty file, header-less file and chunked reading."""
    data_dir = os.path.join(path, 'data')

    # default options, with the index copied into the DataFrame
    indexed = vaex.from_csv(csv_path, copy_index=True)
    _assert_csv_content(indexed, with_index=True)

    # an empty CSV gives an empty DataFrame
    assert len(vaex.from_csv(os.path.join(data_dir, 'empty.csv'))) == 0

    # a CSV without a header row gets positional column names
    headerless = vaex.from_csv(os.path.join(data_dir, 'noheader.csv'), header=None)
    assert len(headerless) == 5
    assert headerless.get_column_names() == ['0', '1', '2']

    # with chunk_size the result is an iterator of one-row DataFrames
    chunk_iter = vaex.from_csv(csv_path, chunk_size=1)
    first = next(chunk_iter)
    assert len(first) == 1
    second = next(chunk_iter)
    third = next(chunk_iter)
    with pytest.raises(StopIteration):
        next(chunk_iter)
    _assert_csv_content(vaex.concat([first, second, third]))
def test_from_big_csv_convert():
    """Time a full CSV -> HDF5 conversion of a large local dataset.

    Any previously converted ``<csv>.hdf5`` file is removed first so that
    ``vaex.from_csv(..., convert=True)`` cannot reuse it. The dataset path
    is machine-specific.
    """
    # Alternative datasets:
    # csv = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-01_0.csv'
    # csv = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1.csv'
    csv = '/Users/byaminov/fun/datasets/yellow_tripdata_2019-01.csv'

    # A missing converted file is the normal state on a first run; only
    # remove it when it is actually there instead of failing the test.
    converted = csv + '.hdf5'
    if os.path.exists(converted):
        os.remove(converted)

    start = datetime.now()
    df = vaex.from_csv(csv, convert=True)
    duration = datetime.now() - start
    print('it took {} to convert {:,} rows, which is {:,} rows per second'.
          format(duration, df.length(),
                 int(df.length() / duration.total_seconds())))
    assert df.length() == 7_667_792
def create_scats_ml_model():
    """Train a random-forest regressor on the SCATS data and dump it to ``model_out``.

    The CSV at ``finalScatsPath`` is converted to ``finalScatsPath + ".hdf5"``
    when that file does not exist yet, then opened shuffled. ``lat``/``lon``
    are PCA-transformed and ``hour``/``dow`` are cycle-transformed; the
    resulting ``pca<digit>`` and ``*_x``/``*_y`` columns are the features
    used to predict ``avg_vol``.
    """
    print("starting scats ml modeling")

    # load existing csv into vaex dataframe
    hdf5_path = finalScatsPath + ".hdf5"
    if not os.path.exists(hdf5_path):
        vaex.from_csv(finalScatsPath,
                      convert=True,
                      copy_index=False,
                      chunk_size=1_000_000)
    df = vaex.open(hdf5_path, shuffle=True)

    # transform the features into more machine learning friendly vars
    pca_coord = vaex.ml.PCA(features=["lat", "lon"],
                            n_components=2,
                            prefix="pca")
    df = pca_coord.fit_transform(df)
    cycl_transform_hour = vaex.ml.CycleTransformer(features=["hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)
    cycl_transform_dow = vaex.ml.CycleTransformer(features=["dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)
    print("dataWrangling done, ready to create model, time: {}s".format(duration()))

    # create a randomForestRegression model
    vaex_model = Predictor(
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        features=df.get_column_names(regex=r"pca[\d]") +
        df.get_column_names(regex=".*_[xy]"),
        target="avg_vol",
        model=RandomForestRegressor(random_state=42, n_estimators=7 * 24),
        prediction_name="p_avg_vol",
    )

    # here we fit and train the model
    with parallel_backend("threading", n_jobs=8):
        vaex_model.fit(df)
    print("\n\nmodel created, time: {}s".format(duration()))

    dump(value=vaex_model, filename=model_out, compress=3)
    print("model written to output, time: {}s".format(duration()))
def test_from_big_csv_read():
    """Time a plain (non-converting) read of a large local CSV and check its row count.

    The dataset path is machine-specific.
    """
    csv_file = '/Users/byaminov/fun/datasets/yellow_tripdata_2019-01.csv'

    started_at = datetime.now()
    # The whole file is read in one call (no chunk_size).
    row_count = len(vaex.from_csv(csv_file))
    elapsed = datetime.now() - started_at

    rows_per_second = int(row_count / elapsed.total_seconds())
    message = 'it took {} to convert {:,} rows, which is {:,} rows per second'
    print(message.format(elapsed, row_count, rows_per_second))
    assert row_count == 7_667_792
def test_from_csv_converting_in_chunks():
    """Check ``vaex.from_csv(..., convert=True)``: chunk cleanup, output naming and reuse."""
    # can read several chunks with converting, intermediate files are deleted
    df = vaex.from_csv(csv_path, chunk_size=1, convert=True)
    _assert_csv_content(df)
    for filename in [
            'small3.csv_chunk0.hdf5', 'small3.csv_chunk1.hdf5',
            'small3.csv_chunk2.hdf5'
    ]:
        assert not os.path.exists(os.path.join(path, 'data', filename))
    assert os.path.exists(os.path.join(path, 'data', 'small3.csv.hdf5'))
    _cleanup_generated_files(df)

    # fails to convert if filename cannot be derived
    # (an in-memory StringIO has no name to derive an output path from)
    with pytest.raises(ValueError, match='Cannot convert.*'):
        with io.StringIO() as f:
            vaex.from_csv(f, convert=True)
    # f.name reveals the path
    with open(csv_path) as f:
        vaex.from_csv(f, convert=True)
    # an explicit output path can be passed through convert=
    with open(csv_path) as f:
        converted_path = os.path.join(path, 'data', 'small3.my.csv.hdf5')
        df = vaex.from_csv(f, convert=converted_path)
    _assert_csv_content(df)
    assert os.path.exists(converted_path)
    _cleanup_generated_files(df)

    # reuses converted HDF5 file
    vaex.from_csv(csv_path, convert=True)
    assert os.path.exists(os.path.join(path, 'data', 'small3.csv.hdf5'))
    try:
        # NOTE(review): the whole sequence sits inside pytest.raises, so the
        # statements after the first FileNotFoundError never run - confirm
        # that the expected failure is the from_csv call on the renamed file.
        with pytest.raises(FileNotFoundError):
            os.rename(csv_path, csv_path + '_')
            df = vaex.from_csv(csv_path, convert=True)
            _assert_csv_content(df)
            _cleanup_generated_files(df)
    finally:
        # always restore the original CSV name for the other tests
        os.rename(csv_path + '_', csv_path)
def ascii_to_vaex(asciifile, df_col=None, names=None, usecols=None, **kwargs):
    """Read raw DOLPHOT output photometry into Vaex.

    Inputs
    ------
    asciifile : str, path object or file-like object
        Path to DOLPHOT photometry file; see `pandas.read_csv`
    df_col : pandas.DataFrame, optional
        DataFrame read in from DOLPHOT columns file; see `read_colfile`.
        When given, it takes precedence: `names` is taken from its
        'names' column and `usecols` from its index minus one.
    names : list-like, optional
        Sequence of column names; see `pandas.read_csv`
        Either `names` or `df_col` must be specified.
    usecols : list-like or callable, optional
        Subset of columns to be read in; see `pandas.read_csv`
        If None, assumes `names` corresponds to first N columns
        Default: None
    **kwargs
        Other keyword arguments to be passed to `vaex.from_csv`

    Returns
    -------
    ds : vaex.dataframe.DataFrame
        Photometry table.

    Raises
    ------
    ValueError
        If neither `names` nor `df_col` is specified.
    """
    if df_col is not None:
        names = df_col['names'].tolist()
        usecols = (df_col.index - 1).tolist()
    elif names is None:
        # Fail with the documented requirement instead of an opaque
        # TypeError from iterating over None further down.
        raise ValueError("Either `names` or `df_col` must be specified.")
    elif usecols is None:
        usecols = list(range(len(names)))

    # null mag values are 99.999, null mag err values are 9.999
    na_values = {
        n: '99.999'
        for n in names if n.endswith(('VEGA', 'TRANS'))
    }
    na_values.update({n: '9.999' for n in names if n.endswith('ERR')})

    ds = vaex.from_csv(asciifile,
                       names=names,
                       usecols=usecols,
                       header=None,
                       delim_whitespace=True,
                       na_values=na_values,
                       float_precision='round_trip',
                       **kwargs)
    return ds
def process_exp_files(expnum, exp_files):
    """Convert one exposure's CSV to HDF5 and record the exposure in the master file.

    Args:
        expnum: exposure number that every row of the exposure file must carry.
        exp_files: pair ``(exp_file, xtr_file)`` of paths to the exposure
            CSV and its companion "xtr" CSV.

    Returns:
        Path of the HDF5 file the exposure data was exported to.
    """
    # Import vaex
    import vaex

    # Unpack exp_files
    exp_file, xtr_file = exp_files

    # Read in the exp_file
    exp_data = vaex.from_csv(exp_file,
                             skipinitialspace=True,
                             header=None,
                             names=EXP_HEADER,
                             squeeze=True,
                             dtype=EXP_HEADER,
                             copy_index=False)

    # Read in the xtr_file (all columns of XTR_HEADER except the last one)
    xtr_data = pd.read_csv(xtr_file,
                           skipinitialspace=True,
                           header=None,
                           names=XTR_HEADER,
                           squeeze=True,
                           dtype=XTR_HEADER,
                           usecols=range(len(XTR_HEADER)-1))
    # Keep only the first row as a flat array of values
    xtr_data = xtr_data.to_numpy()[0]

    # Check if the 'expnum' column contains solely expnum
    if not (exp_data['expnum'] == expnum).evaluate().all():
        # If not, raise error and exit
        raise_error(f"Exposure file {exp_file!r} contains multiple exposures!")

    # Export vaex DataFrame to HDF5
    exp_file_hdf5 = path.join(ARGS.mld, TEMP_EXP_FILE.format(expnum))
    exp_data.export_hdf5(exp_file_hdf5)

    # Open master file
    with h5py.File(ARGS.master_file, 'r+') as m_file:
        # Check if this exposure has been processed before
        expnums = m_file['expnums']['expnum']
        # NOTE(review): the fallback indexes the same object with 'expnum'
        # again when it is empty - intent unclear, confirm with the author.
        expnums = expnums if expnums.size else expnums['expnum']
        index = np.nonzero(expnums == expnum)[0]

        # Save that this exposure has been processed
        # (the row is the xtr values followed by the exposure file's mtime)
        if index.size:
            # Known exposure: overwrite its existing row
            m_file['expnums'][index[0]] = (*xtr_data, path.getmtime(exp_file))
        else:
            # New exposure: grow the dataset by one row and bump the counter
            m_file['expnums'].resize(m_file.attrs['n_expnums']+1, axis=0)
            m_file['expnums'][-1] = (*xtr_data, path.getmtime(exp_file))
            m_file.attrs['n_expnums'] += 1

    # Return exp_file_hdf5
    return(exp_file_hdf5)
def feats2csv(feat_scp, out_dir, utt2spk, spk_dict):
    """Write a feature list as a labelled CSV and as an HDF5 file sorted by class.

    Each line of ``feat_scp`` is split into ``utt_id`` and ``ark_path``; the
    class label is ``spk_dict[utt2spk[utt_id]]``. The space-separated table
    is written to ``<out_dir>/<feat_name>.<args.data_type>.csv``, loaded
    with vaex, sorted by ``class_label`` and exported both to the matching
    ``.hdf5`` file and back over the CSV.

    Args:
        feat_scp: path to the ``.scp`` feature list.
        out_dir: output directory (created if missing).
        utt2spk: mapping from utterance id to speaker.
        spk_dict: mapping from speaker to integer class label.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Strip the literal ".scp" suffix. ``str.rstrip(".scp")`` removes any
    # trailing run of the characters ".", "s", "c", "p" and would mangle
    # names such as "train_mfcc.scp" into "train_mf".
    feat_name = feat_scp.split("/")[-1]
    if feat_name.endswith(".scp"):
        feat_name = feat_name[:-len(".scp")]

    out_csv = os.path.join(out_dir, feat_name + "." + args.data_type + ".csv")
    # Both handles are closed even if a line fails to parse.
    with open(feat_scp, "r") as feats, open(out_csv, "w") as f:
        f.write("utt_id ark_path class_label\n")
        for feat in feats:
            utt_id, ark_path = feat.rstrip().split()
            class_label = spk_dict[utt2spk[utt_id]]
            f.write("%s %s %d\n" % (utt_id, ark_path, class_label))

    # convert csv to h5 file
    df = vaex.from_csv(out_csv, sep=" ")
    df = df.sort(by="class_label")
    h5_name = os.path.join(out_dir, feat_name + "." + args.data_type + ".hdf5")
    df.export(h5_name)
    df.export(out_csv)
def wrapper():
    """Load ``/data/<FILE_SIZE>.csv`` with vaex and time ``func`` on the DataFrame.

    ``FILE_SIZE`` is read from the environment (default ``"m"``). The file
    size is reported in units of 1024**3 bytes. Any exception raised by
    ``func`` is printed with its traceback and reported as a failed
    transfer instead of propagating.
    """
    file_size = os.environ.get("FILE_SIZE", "m")
    file_path = f"/data/{file_size}.csv"

    print("Reading dataset")
    df = vaex.from_csv(file_path,
                       convert=True,
                       chunk_size=5_000_000,
                       copy_index=False)
    size = round(Path(file_path).stat().st_size / 1024 / 1024 / 1024, 3)

    print("Transferring")
    startTime = time.time()
    try:
        func(df)
    except Exception:
        print(traceback.format_exc())
        print(f"====> Transfer failed for the file whose size is {size}Gb")
    else:
        endTime = time.time()
        # Space after "in" added: the two adjacent literals previously
        # rendered as "...Gb in1.234 seconds".
        print(f"====> Transfered {size}Gb in "
              f"{round(endTime - startTime, 3)} seconds")
def test_from_csv_converting_in_chunks():
    """Check ``vaex.from_csv(..., convert=True)``: chunk cleanup, output naming and reuse."""
    # can read several chunks with converting, intermediate files are deleted
    df = vaex.from_csv(csv_path, chunk_size=1, convert=True)
    _assert_csv_content(df)
    for filename in [
            'small3.csv_chunk0.hdf5', 'small3.csv_chunk1.hdf5',
            'small3.csv_chunk2.hdf5'
    ]:
        assert not os.path.exists(os.path.join(path, 'data', filename))
    assert os.path.exists(os.path.join(path, 'data', 'small3.csv.hdf5'))
    _cleanup_generated_files(df)

    # fails to convert if filename cannot be derived
    # (here an open file object is passed together with convert=True)
    with pytest.raises(
            ValueError,
            match='Cannot derive filename to use for converted HDF5 file, '
            'please specify it using convert="my.csv.hdf5"'):
        with open(csv_path) as f:
            vaex.from_csv(f, convert=True)
    # an explicit output path makes converting from a file object work
    with open(csv_path) as f:
        converted_path = os.path.join(path, 'data', 'small3.my.csv.hdf5')
        df = vaex.from_csv(f, convert=converted_path)
    _assert_csv_content(df)
    assert os.path.exists(converted_path)
    _cleanup_generated_files(df)

    # reuses converted HDF5 file
    vaex.from_csv(csv_path, convert=True)
    assert os.path.exists(os.path.join(path, 'data', 'small3.csv.hdf5'))
    try:
        # Move the CSV away: from_csv must now succeed from the HDF5 alone
        os.rename(csv_path, csv_path + '_')
        df = vaex.from_csv(csv_path, convert=True)
        _assert_csv_content(df)
        _cleanup_generated_files(df)
    except FileNotFoundError as e:
        assert False, "vaex.from_csv tried to read from CSV file while a converted HDF5 file existed: %s" % e
    finally:
        # always restore the original CSV name for the other tests
        os.rename(csv_path + '_', csv_path)
def get_raw_dataframe_as_vaex(name):
    """Download the Alpha Vantage daily time series for a symbol as a vaex DataFrame.

    Args:
        name: ticker symbol; it is URL-encoded before being placed in the
            query string.

    Returns:
        vaex DataFrame read from the CSV response.
    """
    from urllib.parse import urlencode

    # NOTE(review): the API key is hard-coded in source control; it should
    # come from configuration or the environment instead.
    query = urlencode({
        "function": "TIME_SERIES_DAILY",
        "symbol": name,
        "apikey": "WCXVE7BAD668SJHL",
        "datatype": "csv",
    })
    df = vaex.from_csv("https://www.alphavantage.co/query?" + query)
    return df
import vaex import matplotlib.pyplot as plt if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument('--divisor_sums_csv_path', type=str, help='The csv file containing divisor sum data') parser.add_argument('--divisor_sums_hdf5_path', type=str, help='The hdf5 file containing divisor sum data') args = parser.parse_args() if args.divisor_sums_csv_path: df = vaex.from_csv(args.divisor_sums_csv_path) else: # can convert to an hdf5 once using # vaex.from_csv('divisor_sums.csv', convert=True, chunk_size=5_000_000) df = vaex.open(args.divisor_sums_hdf5_path) df.select(df.log_n > 12.3, name='log_n_min') df.select(df.witness_value > 1.68, name='witness_value_min') df.viz.heatmap(df.log_n, df.witness_value, limits=['minmax', [1.68, 1.782]], selection=['witness_value_min', 'log_n_min'], colormap='coolwarm', ylabel='witness_value', xlabel='$\log(n)$',
import time
from config import *
import vaex

# Converts the raw CSV named by RAW_DATA (from config) to HDF5.
# from_csv with convert=True writes the converted file back to the same folder
# takes 346s (circa 5mins) to write to
# 4835469885 Oct 23 17:46 pp-complete-202009.csv.hdf5 (4.8GB file)

# Shorter sample file for quick runs:
#RAW_DATA = '/home/ian/data/land_registry/pp-complete-202009.short.csv'
print(f"Processing from {RAW_DATA}")
t1 = time.time()
df = vaex.from_csv(RAW_DATA,
                   copy_index=False,
                   chunk_size=None,
                   convert=True,
                   parse_dates=['date'],
                   names=COLUMNS)
# The "f" belongs inside the format spec; "{...:0.1}f" printed e.g. "3e+02f".
print(f"Took {time.time()-t1:0.1f}s")
def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: Uses `dataframe.export` when convert is a path. If True, ``convert=path+'.hdf5'``
                    The conversion is skipped if the input file or conversion argument did not change.
    :param progress: (_Only applies when convert is not False_) {progress}
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: Extra arguments passed to an optional file system if needed:
        * Amazon AWS S3
            * `anonymous` - access file without authentication (public files)
            * `access_key` - AWS access key, if not provided will use the standard env vars, or the `~/.aws/credentials` file
            * `secret_key` - AWS secret key, similar to `access_key`
            * `profile` - If multiple profiles are present in `~/.aws/credentials`, pick this one instead of 'default', see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            * `region` - AWS Region, e.g. 'us-east-1`, will be determined automatically if not provided.
            * `endpoint_override` - URL/ip to connect to, instead of AWS, e.g. 'localhost:9000' for minio
        * Google Cloud Storage
            * :py:class:`gcsfs.core.GCSFileSystem`
        In addition you can pass the boolean "cache" option.
    :param group: (optional) Specify the group to be read from and HDF5 file. By default this is set to "/table".
    :param fs: Apache Arrow FileSystem object, or FSSpec FileSystem object, if specified, fs_options should be empty.
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    Cloud storage support:

    Vaex supports streaming of HDF5 files from Amazon AWS S3 and Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/(s3|gs) such that successive access
    is as fast as native disk access.

    The following common fs_options are used for S3 access:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)

    All fs_options can also be encoded in the file path as a query string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', fs_options={{'anonymous': True}})
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5', fs_options={{'access_key': my_key, 'secret_key': my_secret_key}})
    >>> df = vaex.open(f's3://mybucket/path/to/file.hdf5?access_key={{my_key}}&secret_key={{my_secret_key}}')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myproject')

    Google Cloud Storage support:

    The following fs_options are used for GCP access:

     * token: Authentication method for GCP. Use 'anon' for annonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5', fs_options={{'token': None}})
    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    # NOTE(review): the `{progress}` placeholder and the doubled braces in the
    # docstring suggest it is passed through a format-style substitution by a
    # decorator that is not visible here - keep the braces as they are.
    # NOTE(review): `fs_options={}` is a shared mutable default; it is only
    # passed along in this body, but confirm no callee mutates it.
    import vaex
    import vaex.convert
    try:
        if not isinstance(path, (list, tuple)):
            # remote and clusters only support single path, not a list
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            # Server URLs: connect to everything before the last "/" and
            # return the dataset named by the last path component.
            if path.startswith("http://") or path.startswith("ws://") or \
               path.startswith("vaex+wss://") or path.startswith("wss://") or \
               path.startswith("vaex+http://") or path.startswith("vaex+ws://"):
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                # Only the first value of each query parameter is used;
                # token and token_trusted are forwarded to vaex.connect.
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        # Resolve aliases and expand glob patterns into concrete filenames.
        filenames = []
        for path in paths:
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            naked_path, options = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                filenames.extend(list(sorted(vaex.file.glob(path, fs_options=fs_options, fs=fs))))
            else:
                filenames.append(path)
        df = None
        if len(filenames) == 0:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        # NOTE(review): filename_hdf5_noshuffle is not used anywhere below.
        filename_hdf5_noshuffle = vaex.convert._convert_name(filenames, shuffle=False)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
            if convert:
                # A string `convert` is the output path; otherwise use the derived name.
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(
                    path_input=path, fs_options_input=fs_options, fs_input=fs,
                    path_output=path_output, fs_options_output=fs_options, fs_output=fs,
                    progress=progress,
                    *args, **kwargs
                )
                ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs, **kwargs)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        elif len(filenames) > 1:
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                # Reuse the combined file from an earlier conversion.
                df = vaex.open(filename_hdf5)
            else:
                # Open every file on its own, then concatenate.
                dfs = []
                for filename in filenames:
                    dfs.append(vaex.open(filename, fs_options=fs_options, fs=fs, convert=bool(convert), shuffle=shuffle, **kwargs))
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    # Write the combined result once and reopen it from disk.
                    df.export_hdf5(filename_hdf5, progress=progress)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        # Log with traceback, then re-raise unchanged for the caller.
        logger.exception("error opening %r" % path)
        raise
def open(path, convert=False, shuffle=False, fs_options={}, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myprofile')

    GCS support:

    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:

     * token: Authentication method for GCP. Use 'anon' for annonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    # NOTE(review): `fs_options={}` is a shared mutable default; it is only
    # passed along in this body, but confirm no callee mutates it.
    import vaex
    import vaex.convert
    try:
        path = vaex.file.stringyfy(path)
        if path in aliases:
            path = aliases[path]
        path = vaex.file.stringyfy(path)
        # Server URLs: connect to everything before the last "/" and
        # return the dataset named by the last path component.
        if path.startswith("http://") or path.startswith("ws://") or \
           path.startswith("vaex+http://") or path.startswith("vaex+ws://"):  # TODO: think about https and wss
            server, name = path.rsplit("/", 1)
            url = urlparse(path)
            if '?' in name:
                name = name[:name.index('?')]
            # Only the first value of each query parameter is used;
            # token and token_trusted are forwarded to vaex.connect.
            extra_args = {
                key: values[0]
                for key, values in parse_qs(url.query).items()
            }
            if 'token' in extra_args:
                kwargs['token'] = extra_args['token']
            if 'token_trusted' in extra_args:
                kwargs['token_trusted'] = extra_args['token_trusted']
            client = vaex.connect(server, **kwargs)
            return client[name]
        if path.startswith("cluster"):
            import vaex.enterprise.distributed
            return vaex.enterprise.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            if isinstance(path, str):
                paths = [path]
            else:
                paths = path
            # Expand glob patterns into concrete filenames.
            filenames = []
            for path in paths:
                naked_path, options = vaex.file.split_options(path)
                if glob.has_magic(naked_path):
                    filenames.extend(
                        list(sorted(vaex.file.glob(path, **kwargs))))
                else:
                    filenames.append(path)
            df = None
            if len(filenames) == 0:
                raise IOError(f'File pattern did not match anything {path}')
            filename_hdf5 = vaex.convert._convert_name(filenames,
                                                       shuffle=shuffle)
            # NOTE(review): filename_hdf5_noshuffle is not used anywhere below.
            filename_hdf5_noshuffle = vaex.convert._convert_name(filenames,
                                                                 shuffle=False)
            if len(filenames) == 1:
                path = filenames[0]
                # # naked_path, _ = vaex.file.split_options(path, fs_options)
                _, ext, _ = vaex.file.split_ext(path)
                if ext == '.csv':  # special case for csv
                    return vaex.from_csv(path,
                                         fs_options=fs_options,
                                         convert=convert,
                                         **kwargs)
                if convert:
                    # A string `convert` is the output path; otherwise use the derived name.
                    path_output = convert if isinstance(convert, str) else filename_hdf5
                    vaex.convert.convert(path_input=path,
                                         fs_options_input=fs_options,
                                         path_output=path_output,
                                         fs_options_output=fs_options,
                                         *args,
                                         **kwargs)
                    ds = vaex.dataset.open(path_output, fs_options=fs_options)
                else:
                    ds = vaex.dataset.open(path, fs_options=fs_options)
                df = vaex.from_dataset(ds)
                if df is None:
                    if os.path.exists(path):
                        raise IOError(
                            'Could not open file: {}, did you install vaex-hdf5? Is the format supported?'
                            .format(path))
            elif len(filenames) > 1:
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = vaex.convert._convert_name(filenames,
                                                               shuffle=shuffle)
                if os.path.exists(
                        filename_hdf5) and convert:  # also check mtime
                    # Reuse the combined file from an earlier conversion.
                    df = vaex.open(filename_hdf5)
                else:
                    # Open every file on its own, then concatenate.
                    dfs = []
                    for filename in filenames:
                        dfs.append(
                            vaex.open(filename,
                                      convert=bool(convert),
                                      shuffle=shuffle,
                                      **kwargs))
                    df = vaex.concat(dfs)
                    if convert:
                        if shuffle:
                            df = df.shuffle()
                        # Write the combined result once and reopen it from disk.
                        df.export_hdf5(filename_hdf5)
                        df = vaex.open(filename_hdf5)

            if df is None:
                raise IOError('Unknown error opening: {}'.format(path))
            return df
    except:
        # Log the failing path, then re-raise unchanged for the caller.
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
#%% import pandas as pd import os import vaex os.chdir(r'\\NAS\WDbackup\备份\Yaf') path = '//Nas/市场营销部/X销售数据留档/2020/2020-销售数据留档/' data = [] for file in os.listdir(path): df = pd.read_csv(path + file, encoding="gbk", thousands=",") df['date'] = file.split(".")[0].split("-")[1] df['date'] = df['date'].astype('datetime64[ns]') df['Ordered Product Sales'] = df['Ordered Product Sales'].str[1:] data.append(df) df = pd.concat(data) df.to_csv('FBA.csv') dv = vaex.from_csv('FBA.csv', convert=True, chunk_size=5_000_000) dv = vaex.open('FBA.csv.hdf5') print(dv.shape) print('------------------------------') # %%
def convert_file_to_hdf5(file_name):
    """Convert the CSV at ``file_name`` to HDF5 via vaex, reading 500,000 rows per chunk.

    The resulting DataFrame is discarded; only the conversion side effect
    of ``convert=True`` is wanted.
    """
    import vaex

    conversion_options = {"convert": True, "chunk_size": 500_000}
    vaex.from_csv(file_name, **conversion_options)
def convert(filename):
    """Read ``MSLR-WEB10K/Fold1/<filename>.txt`` and export it as an Arrow file.

    The first 138 space-separated columns are read and named ``relevance``,
    ``qid`` and ``col1`` ... ``col136``.
    """
    feature_names = [f'col{number}' for number in range(1, 137)]
    source = f'MSLR-WEB10K/Fold1/{filename}.txt'
    target = f'MSLR-WEB10K/Fold1/{filename}.arrow'

    frame = vaex.from_csv(source,
                          sep=' ',
                          usecols=range(138),
                          names=['relevance', 'qid'] + feature_names)
    frame.export(target)
def test_diffent_extension():
    """A CSV with a non-``.csv`` extension loads both directly and with ``convert=True``."""
    source = data_path / 'small2.nocsv'
    for options in ({}, {'convert': True}):
        df = vaex.from_csv(source, **options)
        assert df.x.tolist() == [1, 3]