Example #1
0
	def setUpClass(cls):
		"""Start a local vaex webserver for the test class and connect a client to it.

		Uses (and increments) the module-global test_port so each test class
		gets its own port.
		"""
		global test_port
		# serve no datasets, disable the cache for deterministic tests
		cls.webserver = vaex.webserver.WebServer(datasets=[], port=test_port, cache_byte_size=0)
		#print "serving"
		cls.webserver.serve_threaded()
		#print "getting server object"
		# the class under test decides the transport: websocket or plain http
		scheme = "ws" if cls.use_websocket else "http"
		cls.server = vx.server("%s://localhost:%d" % (scheme, test_port))
		# bump the port so the next test class does not collide with this server
		test_port += 1
Example #2
0
File: ui.py Project: yokeldd/vaex
 def setUpClass(cls):
     """Start a local vaex webserver for this test class and connect a client.

     Uses (and increments) the module-global test_port so every test class
     listens on its own port.
     """
     global test_port
     # serve no datasets and disable the byte cache for deterministic tests
     cls.webserver = vaex.webserver.WebServer(datasets=[],
                                              port=test_port,
                                              cache_byte_size=0)
     #print "serving"
     cls.webserver.serve_threaded()
     #print "getting server object"
     # transport is chosen by the test class: websocket or plain http
     scheme = "ws" if cls.use_websocket else "http"
     cls.server = vx.server("%s://localhost:%d" % (scheme, test_port))
     # avoid port collisions between successive test classes
     test_port += 1
Example #3
0
def open(path, *args, **kwargs):
    """Open a dataset from the file or url given by path.

    :param str path: local or absolute path to file, or a http/ws/cluster url
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return dataset if file is supported, otherwise None
    :rtype: Dataset

    :Example:

    >>> import vaex as vx
    >>> vx.open('myfile.hdf5')
    <vaex.dataset.Hdf5MemoryMapped at 0x1136ee3d0>
    >>> vx.open('gadget_file.hdf5', 3) # this will read only particle type 3
    <vaex.dataset.Hdf5MemoryMappedGadget at 0x1136ef3d0>
    """
    import vaex
    try:
        # resolve registered aliases first
        path = aliases.get(path, path)
        if path.startswith(("http://", "ws://")):  # TODO: think about https and wss
            # remote dataset: split the url into server address and dataset name
            server_url, name = path.rsplit("/", 1)
            remote = vaex.server(server_url, **kwargs)
            available = remote.datasets(as_dict=True)
            if name not in available:
                raise KeyError(
                    "no such dataset '%s' at server, possible dataset names: %s"
                    % (name, " ".join(available.keys())))
            return available[name]
        if path.startswith("cluster"):
            import vaex.distributed
            return vaex.distributed.open(path, *args, **kwargs)
        # plain local file
        import vaex.file
        return vaex.file.open(path, *args, **kwargs)
    except BaseException:
        # log the failing path, then let the caller see the original error
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
Example #4
0
def open(url, thread_mover=None):
    """Open a dataset that is distributed over all hosts of a named cluster.

    The url has the form ``cluster://clustername/dataset_name``; the host list
    is looked up under ``clusters.<clustername>`` in the vaex cluster settings.
    Hosts that cannot be reached are skipped (best effort).

    :param str url: cluster url, e.g. ``cluster://mycluster/mydata``
    :param thread_mover: forwarded to every :py:func:`vx.server` connection
    :return: a DatasetDistributed over all reachable hosts, or None when the
        cluster name is not found in the settings
    """
    url = urlparse(url)
    assert url.scheme in ["cluster"]
    # the dataset name is the url path without its leading slash
    base_path = url.path
    if base_path.startswith("/"):
        base_path = base_path[1:]
    clustername = url.hostname
    clusterlist = vaex.settings.cluster.get("clusters." + clustername, None)
    if clusterlist:
        datasets = []
        for hostname in clusterlist:
            try:
                server = vx.server(hostname, thread_mover=thread_mover)
                datasets_dict = server.datasets(as_dict=True)
            except socket.error:
                # unreachable hosts are skipped, not fatal
                logger.info("could not connect to %s, skipping", hostname)
            else:
                datasets.append(datasets_dict[base_path])
        return DatasetDistributed(datasets=datasets)
Example #5
0
def open(url, thread_mover=None):
    """Open a dataset that is distributed over the hosts of a named cluster.

    The url has the form ``cluster://clustername/dataset_name``; the host list
    is looked up under ``clusters.<clustername>`` in the vaex cluster settings.
    Returns a DatasetDistributed over all reachable hosts, or None (implicitly)
    when the cluster name is not found.

    :param str url: cluster url, e.g. ``cluster://mycluster/mydata``
    :param thread_mover: forwarded to each server connection
    """
    url = urlparse(url)
    assert url.scheme in ["cluster"]
    port = url.port  # NOTE(review): unused -- confirm it can be removed
    # dataset name is the url path without its leading slash
    base_path = url.path
    if base_path.startswith("/"):
        base_path = base_path[1:]
    clustername = url.hostname
    clusterlist = vaex.settings.cluster.get("clusters." + clustername, None)
    if clusterlist:
        datasets = []
        for hostname in clusterlist:
            try:
                server = vx.server(hostname, thread_mover=thread_mover)
                datasets_dict = server.datasets(as_dict=True)
            except socket.error as e:
                # unreachable hosts are skipped (best effort), not fatal
                logger.info("could not connect to %s, skipping", hostname)
            else:
                dataset = datasets_dict[base_path]
                datasets.append(dataset)
            #datasets.append(vx.server(url).datasets()[0])
        dsd = DatasetDistributed(datasets=datasets)
        return dsd
Example #6
0
                        task._result = task.reduce(task._results)
                        task.fulfill(task._result)
                        # remove references
                    task._result = None
                    task._results = None
                self.signal_end.emit()
                # if new tasks were added as a result of this, execute them immediately
                # TODO: we may want to include infinite recursion protection
                self._is_executing = False
                if len(self.task_queue) > 0:
                    logger.debug("task queue not empty.. start over!")
                    self.execute()
        finally:
            self._is_executing = False


if __name__ == "__main__":
    # ad-hoc smoke test: connect to a vaex server given host and port on the
    # command line, then exercise selections on a dataset
    import vaex
    import sys
    vaex.set_log_level_debug()
    server = vaex.server(sys.argv[1], port=int(sys.argv[2]))
    datasets = server.datasets()
    print(datasets)
    dataset = datasets[0]
    # NOTE(review): the remote dataset is immediately replaced by the bundled
    # example dataset below -- presumably a debugging leftover, confirm intent
    dataset = vaex.example()
    print(dataset("x").minmax())
    dataset.select("x < 0")
    print(dataset.selected_length(), len(dataset))
    print(dataset("x").selected().is_masked)
    print(dataset("x").selected().minmax())
Example #7
0
def open(path, convert=False, shuffle=False, copy_index=True, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> ds = vaex.open('sometable.hdf5')
    >>> ds = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str path: local or absolute path to file, or glob string
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :param bool copy_index: copy index when source is read via pandas
    :return: return a DataFrame on succes, otherwise None
    :rtype: DataFrame
    :raises IOError: when no file matches path, or the file cannot be opened
    :raises KeyError: when a remote url names an unknown DataFrame
    """
    import vaex
    try:
        if path in aliases:
            path = aliases[path]
        if path.startswith("http://") or path.startswith("ws://"):  # TODO: think about https and wss
            # remote DataFrame served by a vaex server
            server, DataFrame = path.rsplit("/", 1)
            server = vaex.server(server, **kwargs)
            DataFrames = server.DataFrames(as_dict=True)
            if DataFrame not in DataFrames:
                raise KeyError("no such DataFrame '%s' at server, possible DataFrame names: %s" % (DataFrame, " ".join(DataFrames.keys())))
            return DataFrames[DataFrame]
        if path.startswith("cluster"):
            import vaex.distributed
            return vaex.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            # sort to get predicatable behaviour (useful for testing)
            filenames = list(sorted(glob.glob(path)))
            ds = None
            if len(filenames) == 0:
                raise IOError('Could not open file: {}, it does not exist'.format(path))
            filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
            if len(filenames) == 1:
                path = filenames[0]
                ext = os.path.splitext(path)[1]
                if os.path.exists(filename_hdf5) and convert:  # also check mtime?
                    # a converted copy already exists: open it directly
                    # (the previous inner `if convert:` was always true here)
                    ds = vaex.file.open(filename_hdf5)
                else:
                    if ext == '.csv':  # special support for csv.. should probably approach it a different way
                        ds = from_csv(path, copy_index=copy_index, **kwargs)
                    else:
                        ds = vaex.file.open(path, *args, **kwargs)
                    if convert:
                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                        ds = vaex.file.open(filename_hdf5)  # argument were meant for pandas?
                if ds is None:
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, did you install vaex-hdf5?'.format(path))
                    # BUGFIX: this branch used to repeat `os.path.exists(path)`,
                    # making the "does not exist" error unreachable
                    if not os.path.exists(path):
                        raise IOError('Could not open file: {}, it does not exist?'.format(path))
            elif len(filenames) > 1:
                # a glob matched several files: a non-boolean `convert` is the
                # explicit target filename for the combined hdf5 file
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
                if os.path.exists(filename_hdf5) and convert:  # also check mtime
                    ds = open(filename_hdf5)
                else:
                    # open each file and concatenate them
                    DataFrames = []
                    for filename in filenames:
                        DataFrames.append(open(filename, convert=bool(convert), shuffle=shuffle, **kwargs))
                    ds = vaex.dataframe.DataFrameConcatenated(DataFrames)
                if convert:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    ds = vaex.file.open(filename_hdf5, *args, **kwargs)

        if ds is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return ds
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
Example #8
0
                        task._result = task.reduce(task._results)
                        task.fulfill(task._result)
                        # remove references
                    task._result = None
                    task._results = None
                self.signal_end.emit()
                # if new tasks were added as a result of this, execute them immediately
                # TODO: we may want to include infinite recursion protection
                self._is_executing = False
                if len(self.task_queue) > 0:
                    logger.debug("task queue not empty.. start over!")
                    self.execute()
        finally:
            self._is_executing = False


if __name__ == "__main__":
    # manual smoke test: host and port are taken from the command line
    import vaex
    import sys
    vaex.set_log_level_debug()
    server = vaex.server(sys.argv[1], port=int(sys.argv[2]))
    datasets = server.datasets()
    print(datasets)
    dataset = datasets[0]
    # NOTE(review): the remote dataset is overwritten with the bundled example
    # dataset on the next line -- looks like a debugging leftover, confirm
    dataset = vaex.example()
    print(dataset("x").minmax())
    dataset.select("x < 0")
    print(dataset.selected_length(), len(dataset))
    print(dataset("x").selected().is_masked)
    print(dataset("x").selected().minmax())
Example #9
0
def open(path, convert=False, shuffle=False, copy_index=True, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> ds = vaex.open('sometable.hdf5')
    >>> ds = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :param bool copy_index: copy index when source is read via pandas
    :return: return a DataFrame on succes, otherwise None
    :rtype: DataFrame
    :raises IOError: when no file matches path, or the file cannot be opened
    :raises KeyError: when a remote url names an unknown DataFrame
    """
    import vaex
    try:
        if path in aliases:
            path = aliases[path]
        if path.startswith("http://") or path.startswith("ws://"):  # TODO: think about https and wss
            # remote DataFrame: strip the name (and its query string) off the url
            server, name = path.rsplit("/", 1)
            url = urlparse(path)
            if '?' in name:
                name = name[:name.index('?')]
            # promote token/token_trusted query parameters to keyword arguments
            extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
            if 'token' in extra_args:
                kwargs['token'] = extra_args['token']
            if 'token_trusted' in extra_args:
                kwargs['token_trusted'] = extra_args['token_trusted']
            server = vaex.server(server, **kwargs)
            dataframe_map = server.datasets(as_dict=True)
            if name not in dataframe_map:
                raise KeyError("no such DataFrame '%s' at server, possible names: %s" % (name, " ".join(dataframe_map.keys())))
            return dataframe_map[name]
        if path.startswith("cluster"):
            import vaex.distributed
            return vaex.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            # accept a single path or a list of paths
            if isinstance(path, six.string_types):
                paths = [path]
            else:
                paths = path
            filenames = []
            for path in paths:
                # TODO: can we do glob with s3?
                if path.startswith('s3://'):
                    filenames.append(path)
                else:
                    # sort to get predicatable behaviour (useful for testing)
                    filenames.extend(list(sorted(glob.glob(path))))
            ds = None
            if len(filenames) == 0:
                raise IOError('Could not open file: {}, it does not exist'.format(path))
            filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
            if len(filenames) == 1:
                path = filenames[0]
                # strip a possible query string (e.g. s3 urls) before taking the extension
                naked_path = path
                if '?' in naked_path:
                    naked_path = naked_path[:naked_path.index('?')]
                ext = os.path.splitext(naked_path)[1]
                if os.path.exists(filename_hdf5) and convert:  # also check mtime?
                    # a converted copy already exists: open it directly
                    # (the previous inner `if convert:` was always true here)
                    ds = vaex.file.open(filename_hdf5)
                else:
                    if ext == '.csv' or naked_path.endswith(".csv.bz2"):  # special support for csv.. should probably approach it a different way
                        ds = from_csv(path, copy_index=copy_index, **kwargs)
                    else:
                        ds = vaex.file.open(path, *args, **kwargs)
                    if convert:
                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                        ds = vaex.file.open(filename_hdf5)  # argument were meant for pandas?
                if ds is None:
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, did you install vaex-hdf5?'.format(path))
                    # BUGFIX: this branch used to repeat `os.path.exists(path)`,
                    # making the "does not exist" error unreachable
                    if not os.path.exists(path):
                        raise IOError('Could not open file: {}, it does not exist?'.format(path))
            elif len(filenames) > 1:
                # non-boolean `convert` is the explicit target filename
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
                if os.path.exists(filename_hdf5) and convert:  # also check mtime
                    ds = open(filename_hdf5)
                else:
                    # open each file and concatenate them
                    DataFrames = []
                    for filename in filenames:
                        DataFrames.append(open(filename, convert=bool(convert), shuffle=shuffle, **kwargs))
                    ds = vaex.dataframe.DataFrameConcatenated(DataFrames)
                if convert:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    ds = vaex.file.open(filename_hdf5, *args, **kwargs)

        if ds is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return ds
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
Example #10
0
def server(webserver):
    """Fixture: yield a client connected to the test webserver, then close it."""
    url = "%s://localhost:%d" % (scheme, test_port)
    client = vaex.server(url)
    yield client
    client.close()
Example #11
0
def open(path, convert=False, shuffle=False, copy_index=True, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :param bool copy_index: copy index when source is read via pandas
    :return: return a DataFrame on succes, otherwise None
    :rtype: DataFrame

    S3 support:

    Vaex supports streaming in hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    it as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile_name and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile_name=myprofile')

    """
    import vaex
    try:
        if path in aliases:
            path = aliases[path]
        if path.startswith("http://") or path.startswith("ws://"):  # TODO: think about https and wss
            # remote DataFrame: split server url from name, dropping any query string
            server, name = path.rsplit("/", 1)
            url = urlparse(path)
            if '?' in name:
                name = name[:name.index('?')]
            # promote token/token_trusted query parameters to keyword arguments
            extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
            if 'token' in extra_args:
                kwargs['token'] = extra_args['token']
            if 'token_trusted' in extra_args:
                kwargs['token_trusted'] = extra_args['token_trusted']
            server = vaex.server(server, **kwargs)
            dataframe_map = server.datasets(as_dict=True)
            if name not in dataframe_map:
                raise KeyError("no such DataFrame '%s' at server, possible names: %s" % (name, " ".join(dataframe_map.keys())))
            return dataframe_map[name]
        if path.startswith("cluster"):
            import vaex.distributed
            return vaex.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            # accept a single path or a list of paths
            if isinstance(path, six.string_types):
                paths = [path]
            else:
                paths = path
            filenames = []
            for path in paths:
                # TODO: can we do glob with s3?
                if path.startswith('s3://'):
                    filenames.append(path)
                else:
                    # sort to get predicatable behaviour (useful for testing)
                    filenames.extend(list(sorted(glob.glob(path))))
            ds = None
            if len(filenames) == 0:
                raise IOError('Could not open file: {}, it does not exist'.format(path))
            filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
            filename_hdf5_noshuffle = _convert_name(filenames, shuffle=False)  # NOTE(review): unused -- confirm it can be removed
            if len(filenames) == 1:
                path = filenames[0]
                # strip a possible query string (e.g. s3 urls) before taking the extension
                naked_path = path
                if '?' in naked_path:
                    naked_path = naked_path[:naked_path.index('?')]
                ext = os.path.splitext(naked_path)[1]
                if os.path.exists(filename_hdf5) and convert:  # also check mtime?
                    # NOTE(review): the inner `if convert:` is always true here
                    # (the outer condition already requires convert)
                    if convert:
                        ds = vaex.file.open(filename_hdf5)
                    else:
                        ds = vaex.file.open(filename_hdf5, *args, **kwargs)
                else:
                    if ext == '.csv' or naked_path.endswith(".csv.bz2"):  # special support for csv.. should probably approach it a different way
                        ds = from_csv(path, copy_index=copy_index, **kwargs)
                    else:
                        ds = vaex.file.open(path, *args, **kwargs)
                    if convert and ds:
                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                        ds = vaex.file.open(filename_hdf5) # argument were meant for pandas?
                if ds is None:
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
                    # NOTE(review): this repeats os.path.exists(path), so the
                    # "does not exist" error below is unreachable -- probably
                    # meant `not os.path.exists(path)`; confirm and fix
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, it does not exist?'.format(path))
            elif len(filenames) > 1:
                # a non-boolean `convert` is the explicit target filename
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
                if os.path.exists(filename_hdf5) and convert:  # also check mtime
                    ds = open(filename_hdf5)
                else:
                    # with ProcessPoolExecutor() as executor:
                    # executor.submit(read_csv_and_convert, filenames, shuffle=shuffle, **kwargs)
                    DataFrames = []
                    for filename in filenames:
                        DataFrames.append(open(filename, convert=bool(convert), shuffle=shuffle, **kwargs))
                    ds = vaex.dataframe.DataFrameConcatenated(DataFrames)
                if convert:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    ds = vaex.file.open(filename_hdf5, *args, **kwargs)

        if ds is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return ds
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
Example #12
0
def open(path, convert=False, shuffle=False, copy_index=True, *args, **kwargs):
    """Open a dataset from file given by path

    Example:

    >>> ds = vaex.open('sometable.hdf5')
    >>> ds = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str path: local or absolute path to file, or glob string
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted dataset or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :param bool copy_index: copy index when source is read via pandas
    :return: return dataset if file is supported, otherwise None
    :rtype: Dataset
    :raises IOError: when no file matches path, or the file cannot be opened
    :raises KeyError: when a remote url names an unknown dataset

    :Example:

    >>> import vaex as vx
    >>> vx.open('myfile.hdf5')
    <vaex.dataset.Hdf5MemoryMapped at 0x1136ee3d0>
    >>> vx.open('gadget_file.hdf5', 3) # this will read only particle type 3
    <vaex.dataset.Hdf5MemoryMappedGadget at 0x1136ef3d0>
    """
    import vaex
    try:
        if path in aliases:
            path = aliases[path]
        if path.startswith("http://") or path.startswith("ws://"):  # TODO: think about https and wss
            # remote dataset served by a vaex server
            server, dataset = path.rsplit("/", 1)
            server = vaex.server(server, **kwargs)
            datasets = server.datasets(as_dict=True)
            if dataset not in datasets:
                raise KeyError("no such dataset '%s' at server, possible dataset names: %s" % (dataset, " ".join(datasets.keys())))
            return datasets[dataset]
        if path.startswith("cluster"):
            import vaex.distributed
            return vaex.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            # sort to get predicatable behaviour (useful for testing)
            filenames = list(sorted(glob.glob(path)))
            ds = None
            if len(filenames) == 0:
                raise IOError('Could not open file: {}, it does not exist'.format(path))
            filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
            if len(filenames) == 1:
                path = filenames[0]
                ext = os.path.splitext(path)[1]
                if os.path.exists(filename_hdf5) and convert:  # also check mtime?
                    # a converted copy already exists: open it directly
                    # (the previous inner `if convert:` was always true here)
                    ds = vaex.file.open(filename_hdf5)
                else:
                    if ext == '.csv':  # special support for csv.. should probably approach it a different way
                        ds = from_csv(path, copy_index=copy_index, **kwargs)
                    else:
                        ds = vaex.file.open(path, *args, **kwargs)
                    if convert:
                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                        ds = vaex.file.open(filename_hdf5)  # argument were meant for pandas?
                if ds is None:
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, did you install vaex-hdf5?'.format(path))
                    # BUGFIX: this branch used to repeat `os.path.exists(path)`,
                    # making the "does not exist" error unreachable
                    if not os.path.exists(path):
                        raise IOError('Could not open file: {}, it does not exist?'.format(path))
            elif len(filenames) > 1:
                # a non-boolean `convert` is the explicit target filename
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
                if os.path.exists(filename_hdf5) and convert:  # also check mtime
                    ds = open(filename_hdf5)
                else:
                    # open each file and concatenate them
                    datasets = []
                    for filename in filenames:
                        datasets.append(open(filename, convert=bool(convert), shuffle=shuffle, **kwargs))
                    ds = vaex.dataset.DatasetConcatenated(datasets)
                if convert:
                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                    ds = vaex.file.open(filename_hdf5, *args, **kwargs)

        if ds is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return ds
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
__author__ = 'breddels'
# Python 2 demo script: connect to a local vaex server, open the first
# available dataset and compare a subspace against a selection of it.
import vaex as vx
import numpy as np
import pylab

server = vx.server("localhost")

# NOTE(review): `list` shadows the builtin; kept as-is in this throwaway script
list = server.list_datasets()
print list
ds = server.open(list[0])
print "length", len(ds)
# two-column subspace (x, y) of the remote dataset
subspace = ds("x", "y")
limits = subspace.limits_sigma(sigmas=3, square=True)

ds.select("z>50")
selected = subspace.selected()

print subspace.mean()
print subspace.var()
print subspace.limits_sigma()
print subspace.limits_sigma(sigmas=1)
#limits = subspace.minmax()
print "square limits", limits
# histogram both the full subspace and the selection over the same limits
grid = subspace.histogram(limits=limits)
grid_selected = selected.histogram(limits=limits)
subspace.plot(np.log(grid), limits=limits)
pylab.contour(np.log(grid_selected),
              2,
              linewidth="2pt",
              colors="blue",
              extent=limits.flatten(),
Example #14
0
def main(argv):
    """Command line tool for managing vaex cluster definitions.

    Sub commands:

    * ``add``: register hostnames under a cluster name in the vaex settings
    * ``check``: connect to each host of a cluster, list its datasets, and
      optionally drop unreachable hosts (``--clean``)

    :param list argv: full argument list; argv[0] is used as the program name
    """
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help="do not output anything")

    subparsers = parser.add_subparsers(help='type of task', dest="task")

    parser_add = subparsers.add_parser('add', help='add hosts to cluser')
    parser_add.add_argument("name", help="name of cluster")
    parser_add.add_argument("hostnames", help="hostnames", nargs="*")
    parser_add.add_argument('--reset', '-r', default=False, action='store_true', help="clear previous hosts")

    parser_check = subparsers.add_parser('check', help='check if hosts exists')
    parser_check.add_argument("name", help="name of cluster")
    parser_check.add_argument('--clean', '-c', default=False, action='store_true', help="remove hosts that are not up")

    args = parser.parse_args(argv[1:])

    # map -v count to a log level, capped at DEBUG
    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])
    quiet = args.quiet
    if args.task == "check":
        name = args.name
        clusterlist = vaex.settings.cluster.get("clusters." + name, None)
        if clusterlist is None:
            if not quiet:
                print("cluster does not exist: %s" % name)
        else:
            # `common` collects dataset names seen on any reachable host
            common = None
            for hostname in clusterlist:
                print(hostname)
                try:
                    server = vx.server(hostname)
                    datasets = server.datasets()
                except socket.error as e:
                    print("\t" + str(e))
                    if args.clean:
                        # NOTE(review): removing from the list being iterated
                        # can skip the following host -- confirm and fix
                        clusterlist.remove(hostname)
                else:
                    for dataset in datasets:
                        print("\t" + dataset.name)
                        # if common is None:
                    names = set([k.name for k in datasets])
                    common = names if common is None else common.union(names)
            print("Cluster: " + name + " has %d hosts connected, to connect to a dataset, use the following urls:" % (len(clusterlist)))
            for dsname in common or []:
                print("\tcluster://%s/%s" % (name, dsname))
            if args.clean:
                vaex.settings.cluster.store("clusters." + name, clusterlist)
    if args.task == "add":
        name = args.name
        clusterlist = vaex.settings.cluster.get("clusters." + name, [])
        if args.reset:
            clusterlist = []
        # add only hostnames not already registered
        for hostname in args.hostnames:
            if hostname not in clusterlist:
                clusterlist.append(hostname)
        vaex.settings.cluster.store("clusters." + name, clusterlist)
        if not args.quiet:
            print("hosts in cluster: %s" % name)
            for hostname in clusterlist:
                print("\t%s" % (hostname))
Example #15
0
def server(webserver):
    """Fixture: yield a client connected to the test webserver, closing it afterwards."""
    server = vaex.server("%s://localhost:%d" % (scheme, test_port))
    yield server
    server.close()
Example #16
0
def main(argv):
    """Command line tool for managing vaex cluster definitions.

    Sub commands:

    * ``add``: register hostnames under a cluster name in the vaex settings
    * ``check``: connect to each host of a cluster, list its datasets, and
      optionally drop unreachable hosts (``--clean``)

    :param list argv: full argument list; argv[0] is used as the program name
    """
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet',
                        '-q',
                        default=False,
                        action='store_true',
                        help="do not output anything")

    subparsers = parser.add_subparsers(help='type of task', dest="task")

    parser_add = subparsers.add_parser('add', help='add hosts to cluser')
    parser_add.add_argument("name", help="name of cluster")
    parser_add.add_argument("hostnames", help="hostnames", nargs="*")
    parser_add.add_argument('--reset',
                            '-r',
                            default=False,
                            action='store_true',
                            help="clear previous hosts")

    parser_check = subparsers.add_parser('check', help='check if hosts exists')
    parser_check.add_argument("name", help="name of cluster")
    parser_check.add_argument('--clean',
                              '-c',
                              default=False,
                              action='store_true',
                              help="remove hosts that are not up")

    args = parser.parse_args(argv[1:])

    # map -v count to a log level, capped at DEBUG
    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])
    quiet = args.quiet
    if args.task == "check":
        name = args.name
        clusterlist = vaex.settings.cluster.get("clusters." + name, None)
        if clusterlist is None:
            if not quiet:
                print("cluster does not exist: %s" % name)
        else:
            # `common` collects dataset names seen on any reachable host
            common = None
            # BUGFIX: iterate a copy -- clusterlist.remove() below would
            # otherwise skip the host following a removed one
            for hostname in list(clusterlist):
                print(hostname)
                try:
                    server = vx.server(hostname)
                    datasets = server.datasets()
                except socket.error as e:
                    print("\t" + str(e))
                    if args.clean:
                        clusterlist.remove(hostname)
                else:
                    for dataset in datasets:
                        print("\t" + dataset.name)
                    names = set([k.name for k in datasets])
                    common = names if common is None else common.union(names)
            print(
                "Cluster: " + name +
                " has %d hosts connected, to connect to a dataset, use the following urls:"
                % (len(clusterlist)))
            # BUGFIX: common is still None when no host was reachable; iterating
            # None raised TypeError (the sibling version guards with `or []`)
            for dsname in common or []:
                print("\tcluster://%s/%s" % (name, dsname))
            if args.clean:
                vaex.settings.cluster.store("clusters." + name, clusterlist)
    if args.task == "add":
        name = args.name
        clusterlist = vaex.settings.cluster.get("clusters." + name, [])
        if args.reset:
            clusterlist = []
        # add only hostnames not already registered
        for hostname in args.hostnames:
            if hostname not in clusterlist:
                clusterlist.append(hostname)
        vaex.settings.cluster.store("clusters." + name, clusterlist)
        if not args.quiet:
            print("hosts in cluster: %s" % name)
            for hostname in clusterlist:
                print("\t%s" % (hostname))
Example #17
0
__author__ = 'breddels'
# Python 2 demo script: connect to a local vaex server, open the first
# available dataset, and plot a subspace together with a selection of it.
import vaex as vx
import numpy as np
import pylab


server = vx.server("localhost")


# NOTE(review): `list` shadows the builtin; kept as-is in this throwaway script
list = server.list_datasets()
print list
ds = server.open(list[0])
print "length", len(ds)
# two-column subspace (x, y) of the remote dataset
subspace = ds("x", "y")
limits = subspace.limits_sigma(sigmas=3, square=True)

ds.select("z>50")
selected = subspace.selected()


print subspace.mean()
print subspace.var()
print subspace.limits_sigma()
print subspace.limits_sigma(sigmas=1)
#limits = subspace.minmax()
print "square limits", limits
# histogram the full subspace and the selection over identical limits,
# then overlay the selection as a contour on the log-density plot
grid = subspace.histogram(limits=limits)
grid_selected = selected.histogram(limits=limits)
subspace.plot(np.log(grid), limits=limits)
pylab.contour(np.log(grid_selected), 2, linewidth="2pt", colors="blue", extent=limits.flatten(), alpha=0.8)