def open_many(filenames):
    """Open a list of filenames and return a dataset with all datasets concatenated.

    Entries are stripped of surrounding whitespace; blank entries and entries
    starting with ``#`` (comment lines, e.g. from a file list) are skipped.

    :param list[str] filenames: list of filenames/paths
    :rtype: Dataset
    """
    # Normalize once, then keep only real (non-blank, non-comment) entries.
    cleaned = (filename.strip() for filename in filenames)
    datasets = [open(filename) for filename in cleaned
                if filename and not filename.startswith("#")]
    return vaex.dataset.DatasetConcatenated(datasets=datasets)
def open(path, convert=False, shuffle=False, copy_index=True, *args, **kwargs):
    """Open a dataset from file given by path.

    Example:

    >>> ds = vaex.open('sometable.hdf5')
    >>> ds = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str path: local or absolute path to file, or glob string
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted dataset or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :param bool copy_index: copy index when source is read via pandas
    :return: return dataset if file is supported, otherwise None
    :rtype: Dataset

    :Example:

    >>> import vaex as vx
    >>> vx.open('myfile.hdf5')
    <vaex.dataset.Hdf5MemoryMapped at 0x1136ee3d0>
    >>> vx.open('gadget_file.hdf5', 3) # this will read only particle type 3
    <vaex.dataset.Hdf5MemoryMappedGadget at 0x1136ef3d0>
    """
    import vaex
    try:
        # Resolve user-registered aliases to real paths/URLs first.
        if path in aliases:
            path = aliases[path]
        if path.startswith("http://") or path.startswith("ws://"):  # TODO: think about https and wss
            # Remote dataset: the path is "<server-url>/<dataset-name>".
            server, dataset = path.rsplit("/", 1)
            server = vaex.server(server, **kwargs)
            datasets = server.datasets(as_dict=True)
            if dataset not in datasets:
                raise KeyError("no such dataset '%s' at server, possible dataset names: %s" % (dataset, " ".join(datasets.keys())))
            return datasets[dataset]
        if path.startswith("cluster"):
            import vaex.distributed
            return vaex.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            # sort to get predictable behaviour (useful for testing)
            filenames = list(sorted(glob.glob(path)))
            ds = None
            if len(filenames) == 0:
                raise IOError('Could not open file: {}, it does not exist'.format(path))
            # Name of the hdf5 file a conversion would produce for this input.
            filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
            if len(filenames) == 1:
                path = filenames[0]
                ext = os.path.splitext(path)[1]
                if os.path.exists(filename_hdf5) and convert:  # also check mtime?
                    # Converted file already exists: reuse it instead of re-reading
                    # the source. (The extra args/kwargs were meant for the original
                    # reader, not for the hdf5 file.)
                    ds = vaex.file.open(filename_hdf5)
                else:
                    if ext == '.csv':  # special support for csv.. should probably approach it a different way
                        ds = from_csv(path, copy_index=copy_index, **kwargs)
                    else:
                        ds = vaex.file.open(path, *args, **kwargs)
                    if convert:
                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                        ds = vaex.file.open(filename_hdf5)  # argument were meant for pandas?
                if ds is None:
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, did you install vaex-hdf5?'.format(path))
                    # Fixed: this branch previously re-tested os.path.exists(path),
                    # making the "does not exist" error unreachable.
                    if not os.path.exists(path):
                        raise IOError('Could not open file: {}, it does not exist?'.format(path))
            elif len(filenames) > 1:
                # convert may be a truthy string, in which case it is the target path.
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
                if os.path.exists(filename_hdf5) and convert:  # also check mtime
                    ds = open(filename_hdf5)
                else:
                    # with ProcessPoolExecutor() as executor:
                    # executor.submit(read_csv_and_convert, filenames, shuffle=shuffle, **kwargs)
                    datasets = []
                    for filename in filenames:
                        datasets.append(open(filename, convert=bool(convert), shuffle=shuffle, **kwargs))
                    ds = vaex.dataset.DatasetConcatenated(datasets)
                    if convert:
                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                        ds = vaex.file.open(filename_hdf5, *args, **kwargs)
            if ds is None:
                raise IOError('Unknown error opening: {}'.format(path))
            return ds
    except:  # bare on purpose: log the failing path for any error, then re-raise
        logging.getLogger("vaex").error("error opening %r", path)
        raise