Example #1
0
def open_many(filenames):
    """Open every listed file and return the results concatenated as one dataset.

    Each entry is stripped of surrounding whitespace first. Entries that are
    empty after stripping, or whose first character is ``#``, are skipped.

    :param list[str] filenames: list of filenames/paths
    :rtype: Dataset
    """
    # Lazy generator: each name is stripped right before it is opened, so the
    # strip/open interleaving is the same as in an explicit loop.
    candidates = (entry.strip() for entry in filenames)
    opened = [open(name) for name in candidates if name and name[0] != "#"]
    return vaex.dataset.DatasetConcatenated(datasets=opened)
Example #2
0
def open(path, convert=False, shuffle=False, copy_index=True, *args, **kwargs):
    """Open a dataset from file given by path.

    Example:

    >>> ds = vaex.open('sometable.hdf5')
    >>> ds = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    Resolution order:

    1. ``path`` is replaced by ``aliases[path]`` when it is a known alias.
    2. Paths starting with ``http://`` or ``ws://`` are split at the last
       ``/`` into a server address and a dataset name, and the dataset is
       fetched from that server.
    3. Paths starting with ``cluster`` are handed to ``vaex.distributed.open``.
    4. Anything else is treated as a glob pattern on the local filesystem.
       A single match is opened directly (``.csv`` files go through
       ``from_csv``); multiple matches are opened one by one and concatenated.

    :param str path: local or absolute path to file, or glob string
    :param convert: convert files to an hdf5 file for optimization, can also be a path.
        When the target hdf5 file already exists it is opened instead of
        reading and converting the source files again.
    :param bool shuffle: shuffle converted dataset or not
    :param bool copy_index: copy index when source is read via pandas
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: the opened dataset
    :rtype: Dataset
    :raises IOError: when the glob matches no file, or a matched file could not be opened
    :raises KeyError: when the requested dataset name is not available at the server

    :Example:

    >>> import vaex as vx
    >>> vx.open('myfile.hdf5')
    <vaex.dataset.Hdf5MemoryMapped at 0x1136ee3d0>
    """
    import vaex
    try:
        if path in aliases:
            path = aliases[path]
        if path.startswith(("http://", "ws://")):  # TODO: think about https and wss
            server, dataset = path.rsplit("/", 1)
            server = vaex.server(server, **kwargs)
            datasets = server.datasets(as_dict=True)
            if dataset not in datasets:
                raise KeyError("no such dataset '%s' at server, possible dataset names: %s" % (dataset, " ".join(datasets.keys())))
            return datasets[dataset]
        if path.startswith("cluster"):
            import vaex.distributed
            return vaex.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        # sort to get predictable behaviour (useful for testing)
        filenames = sorted(glob.glob(path))
        if not filenames:
            raise IOError('Could not open file: {}, it does not exist'.format(path))

        # An explicit (non-boolean, non-empty) `convert` value is the target
        # path; otherwise the name is derived from the matched filenames.
        if convert and convert not in (True, False):
            filename_hdf5 = convert
        else:
            filename_hdf5 = _convert_name(filenames, shuffle=shuffle)

        ds = None
        if len(filenames) == 1:
            path = filenames[0]
            if convert and os.path.exists(filename_hdf5):  # also check mtime?
                # Reuse the previously converted file.
                ds = vaex.file.open(filename_hdf5)
            else:
                ext = os.path.splitext(path)[1]
                if ext == '.csv':  # special support for csv.. should probably approach it a different way
                    ds = from_csv(path, copy_index=copy_index, **kwargs)
                else:
                    ds = vaex.file.open(path, *args, **kwargs)
                # Only export when a reader produced a dataset, so that an
                # unsupported file reaches the IOError below instead of
                # failing with an AttributeError on None.
                if convert and ds is not None:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    ds = vaex.file.open(filename_hdf5)  # argument were meant for pandas?
            if ds is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5?'.format(path))
                raise IOError('Could not open file: {}, it does not exist?'.format(path))
        else:
            if convert and os.path.exists(filename_hdf5):  # also check mtime
                # Reuse the previously converted file; do not export it onto
                # itself again.
                ds = open(filename_hdf5)
            else:
                # with ProcessPoolExecutor() as executor:
                # executor.submit(read_csv_and_convert, filenames, shuffle=shuffle, **kwargs)
                datasets = [
                    open(filename, convert=bool(convert), shuffle=shuffle, copy_index=copy_index, **kwargs)
                    for filename in filenames
                ]
                ds = vaex.dataset.DatasetConcatenated(datasets)
                if convert:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    # Reader arguments are not passed on here, matching the
                    # single-file branch: they were meant for the source
                    # format, not for the converted hdf5 file.
                    ds = vaex.file.open(filename_hdf5)

        if ds is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return ds
    except Exception:
        logging.getLogger("vaex").error("error opening %r", path)
        raise