Beispiel #1
0
 def start(self):
     """
     Start multiple workerpools, possibly on remote servers via ssh,
     assuming there is an active streamer.
     """
     launched = []
     for host, cores in self.host_cores:
         if general.socket_ready((host, self.ctrl_port)):
             print('%s:%s already running' % (host, self.ctrl_port))
             continue
         ctrl_url = 'tcp://0.0.0.0:%s' % self.ctrl_port
         local = host == '127.0.0.1'
         if local:
             cmd = [sys.executable]
         else:  # spawn the pool through a non-interactive ssh session
             cmd = ['ssh', '-f', '-T', f'{self.remote_user}@{host}',
                    self.remote_python]
         cmd.extend(['-m', 'openquake.baselib.workerpool', ctrl_url,
                     '-n', cores])
         if not local:
             print('%s: if it hangs, check the ssh keys' % ' '.join(cmd))
         self.popens.append(subprocess.Popen(cmd))
         launched.append(host)
     return 'starting %s' % launched
Beispiel #2
0
 def set_concurrent_tasks_default(calc):
     """
     Set the default for concurrent_tasks based on the available
     worker pools.

     :param calc: calculator object (not used in the body)
     """
     num_workers = 0
     w = config.zworkers
     # w.host_cores is a comma-separated string of "hostname numcores" pairs
     if w.host_cores:
         host_cores = [hc.split() for hc in w.host_cores.split(',')]
     else:
         host_cores = []
     for host, _cores in host_cores:
         url = 'tcp://%s:%s' % (host, w.ctrl_port)
         # check availability *before* opening the REQ socket, so we do
         # not build and connect a socket towards a pool that is down
         if not general.socket_ready(url):
             logging.warning('%s is not running', host)
             continue
         with z.Socket(url, z.zmq.REQ, 'connect') as sock:
             num_workers += sock.send('get_num_workers')
     if num_workers == 0:
         # no configured pools: fall back on the local core count
         num_workers = os.cpu_count()
         logging.warning(
             'Missing host_cores, no idea about how many cores '
             'are available, using %d', num_workers)
     # twice as many tasks as workers, to keep the pools busy
     parallel.CT = num_workers * 2
     OqParam.concurrent_tasks.default = num_workers * 2
     logging.warning('Using %d zmq workers', num_workers)
Beispiel #3
0
    def start(self, streamer=False):
        """
        Start multiple workerpools, possibly on remote servers via ssh,
        and possibly a streamer, depending on the `streamercls`.

        :param streamer:
            if True, starts a streamer with multiprocessing.Process
        :returns: a string 'starting <list of spawned command lines>'
        """
        # spawn the streamer only if nothing is already bound to its socket
        if streamer and not general.socket_ready(self.task_in_url):  # started
            self.streamer = multiprocessing.Process(
                target=_streamer,
                args=(self.master_host, self.task_in_port, self.task_out_port))
            self.streamer.start()
        starting = []
        for host, cores in self.host_cores:
            # skip hosts whose pool already answers on the control port
            if self.status(host)[0][1] == 'running':
                print('%s:%s already running' % (host, self.ctrl_port))
                continue
            ctrl_url = 'tcp://%s:%s' % (host, self.ctrl_port)
            if host == '127.0.0.1':  # localhost
                args = [sys.executable]
            else:
                # remote host: run the remote interpreter through ssh
                args = ['ssh', host, self.remote_python]
            args += ['-m', 'openquake.baselib.workerpool',
                     ctrl_url, self.task_out_url, cores]
            starting.append(' '.join(args))
            po = subprocess.Popen(args)
            self.pids.append(po.pid)  # remembered so stop/kill can reap it
        return 'starting %s' % starting
    def start(self, streamer=False):
        """
        Start multiple workerpools, possibly on remote servers via ssh,
        and possibly a streamer, depending on the `streamercls`.

        :param streamer:
            if True, starts a streamer with multiprocessing.Process
        """
        if streamer and not general.socket_ready(self.task_in_url):  # started
            self.streamer = multiprocessing.Process(
                target=_streamer, args=(
                    self.master_host, self.task_in_port, self.task_out_port))
            self.streamer.start()
        starting = []
        for host, cores in self.host_cores:
            if self.status(host)[0][1] == 'running':
                print('%s:%s already running' % (host, self.ctrl_port))
                continue
            ctrl_url = 'tcp://%s:%s' % (host, self.ctrl_port)
            if host == '127.0.0.1':  # local worker pool
                cmdline = [sys.executable]
            else:  # remote pool, spawned via ssh
                cmdline = ['ssh', host, self.remote_python]
            cmdline += ['-m', 'openquake.baselib.workerpool', ctrl_url,
                        self.task_out_url, cores]
            starting.append(' '.join(cmdline))
            self.pids.append(subprocess.Popen(cmdline).pid)
        return 'starting %s' % starting
Beispiel #5
0
 def setUpClass(cls):
     """
     Start a WorkerMaster on localhost, skipping the whole test case
     when the task streamer is not reachable.
     """
     cls.z = config.zworkers.copy()
     host_cores = '127.0.0.1 4'  # a single local host with 4 cores
     # the task streamer is expected on ctrl_port + 1 — same convention
     # used by check_status; TODO confirm against the streamer code
     hostport = '127.0.0.1', int(cls.z['ctrl_port']) + 1
     if not socket_ready(hostport):
         raise unittest.SkipTest('The task streamer is off')
     cls.master = WorkerMaster('127.0.0.1', cls.z['ctrl_port'], host_cores)
     cls.master.start()
Beispiel #6
0
def get_status(address=None):
    """
    Check if the DbServer is up.

    :param address: pair (hostname, port)
    :returns: 'running' or 'not-running'
    """
    if not address:
        # fall back on the configured DbServer endpoint
        address = (config.dbserver.host, DBSERVER_PORT)
    up = socket_ready(address)
    return 'running' if up else 'not-running'
Beispiel #7
0
def get_status(address=None):
    """
    Check if the DbServer is up.

    :param address: pair (hostname, port)
    :returns: 'running' or 'not-running'
    """
    hostport = address or (config.dbserver.host, DBSERVER_PORT)
    if socket_ready(hostport):
        return 'running'
    return 'not-running'
Beispiel #8
0
 def status(self, host=None):
     """
     :returns: a list of pairs (hostname, 'running'|'not-running')
     """
     # restrict to a single host when one is given
     if host is None:
         targets = self.host_cores
     else:
         targets = [hc for hc in self.host_cores if hc[0] == host]
     result = []
     for hostname, _cores in targets:
         up = general.socket_ready((hostname, self.ctrl_port))
         result.append((hostname, 'running' if up else 'not-running'))
     return result
Beispiel #9
0
 def status(self, host=None):
     """
     :returns: a list of pairs (hostname, 'running'|'not-running')
     """
     # with host=None report on every configured host
     host_cores = [
         hc for hc in self.host_cores if host is None or hc[0] == host]
     return [
         (h, 'running' if general.socket_ready((h, self.ctrl_port))
          else 'not-running')
         for h, _ in host_cores]
Beispiel #10
0
def check_status(**kw):
    """
    :returns: a non-empty error string if the streamer or worker pools are down
    """
    conf = config.zworkers.copy()
    conf.update(kw)  # keyword arguments override the configuration
    errors = []
    hostport = config.dbserver.listen, int(conf['ctrl_port']) + 1
    if not general.socket_ready(hostport):
        errors.append('The task streamer on %s:%s is down' % hostport)
    errors.extend(
        'The workerpool on %s is down' % host
        for host, status in WorkerMaster(**conf).status()
        if status != 'running')
    return '\n'.join(errors)
Beispiel #11
0
 def status(self):
     """
     :returns: a list [(host, running, total), ...]
     """
     executing = []
     for host, _cores in self.host_cores:
         # skip hosts where no pool is listening on the control port
         if not general.socket_ready((host, self.ctrl_port)):
             continue
         ctrl_url = 'tcp://%s:%s' % (host, self.ctrl_port)
         with z.Socket(ctrl_url, z.zmq.REQ, 'connect') as sock:
             # 'get_executing' apparently returns a whitespace-separated
             # string of running tasks — TODO confirm against the pool code
             running = len(sock.send('get_executing').split())
             total = sock.send('get_num_workers')
             executing.append((host, running, total))
     return executing
Beispiel #12
0
 def set_concurrent_tasks_default(job_id):
     """
     Set the default for concurrent_tasks based on the available
     worker pools.

     :param job_id: not used in the body
     """
     num_workers = 0
     w = config.zworkers
     # w.host_cores is a comma-separated string of "hostname numcores" pairs
     for host, _cores in [hc.split() for hc in w.host_cores.split(',')]:
         url = 'tcp://%s:%s' % (host, w.ctrl_port)
         with z.Socket(url, z.zmq.REQ, 'connect') as sock:
             # NOTE(review): the socket is created before the availability
             # check; presumably fine since zmq connects lazily — confirm
             if not general.socket_ready(url):
                 logs.LOG.warn('%s is not running', host)
                 continue
             num_workers += sock.send('get_num_workers')
     # twice as many tasks as workers, to keep the pools busy
     OqParam.concurrent_tasks.default = num_workers * 2
     logs.LOG.warn('Using %d zmq workers', num_workers)
Beispiel #13
0
 def kill(self):
     """
     Send a "kill" command to all worker pools
     """
     killed = []
     for host, _cores in self.host_cores:
         # only talk to pools that are actually listening
         if not general.socket_ready((host, self.ctrl_port)):
             continue
         url = 'tcp://%s:%s' % (host, self.ctrl_port)
         with z.Socket(url, z.zmq.REQ, 'connect') as sock:
             sock.send('kill')
         killed.append(host)
     for proc in self.popens:
         proc.kill()
     self.popens = []
     return 'killed %s' % killed
Beispiel #14
0
 def set_concurrent_tasks_default(job_id):
     """
     Set the default for concurrent_tasks based on the available
     worker pools.

     :param job_id: not used in the body
     """
     num_workers = 0
     w = config.zworkers
     # w.host_cores is a comma-separated string of "hostname numcores" pairs
     for host, _cores in [hc.split() for hc in w.host_cores.split(',')]:
         url = 'tcp://%s:%s' % (host, w.ctrl_port)
         with z.Socket(url, z.zmq.REQ, 'connect') as sock:
             if not general.socket_ready(url):
                 logs.LOG.warn('%s is not running', host)
                 continue
             num_workers += sock.send('get_num_workers')
     # NOTE(review): multiplier is 3 here, while other versions of this
     # function in the codebase use 2 — confirm which one is intended
     OqParam.concurrent_tasks.default = num_workers * 3
     logs.LOG.warn('Using %d zmq workers', num_workers)
Beispiel #15
0
 def stop(self):
     """
     Send a "stop" command to all worker pools

     :returns: a string 'stopped <list of stopped hosts>'
     """
     stopped = []
     for host, _ in self.host_cores:
         # skip hosts where no pool is listening on the control port
         if not general.socket_ready((host, self.ctrl_port)):
             continue
         ctrl_url = 'tcp://%s:%s' % (host, self.ctrl_port)
         with z.Socket(ctrl_url, z.zmq.REQ, 'connect') as sock:
             sock.send('stop')
             stopped.append(host)
     for popen in self.popens:
         popen.terminate()
         # since we are not consuming any output from the spawned process
         # we must call wait() after terminate() to have Popen()
         # fully deallocate the process file descriptors, otherwise
         # zombies will arise
         popen.wait()
     self.popens = []
     return 'stopped %s' % stopped