Example #1
def __exit__(self, _type, _value, _traceback):
    logging.log(1, "__exit__({}, {}, {})".format(_type, _value,
                                                 _traceback))
    if self._exited:
        return
    # clean up any remaining runner processes
    if _type or _value or _traceback:
        logging.error('An error was raised in the Cluster context',
                      exc_info=(_type, _value, _traceback))
        self.stop_background_threads(_value)
    else:
        self.stop_background_threads()
    try:
        for w in self.workers:
            w.stop()
        for w in self.dead_workers:
            w.stop()
        # Wait on runners to finish waiting on their subprocesses to exit
        for w in self.runners:
            # Check the thread ident to avoid an error when joining an
            # un-started thread.
            if w.ident:  # ident is set when a thread is started
                w.join(self.worker_join_timeout)
        alive = []
        while self.workers:
            w = self.workers.pop()
            if w.is_alive():
                alive.append(w)
            else:
                self.dead_workers.append(w)
        if alive:
            alive_names = ', '.join(w.name for w in alive)
            raise ClusterError("Runners [{}] failed to exit cleanly after"
                               " {} seconds.".format(
                                   alive_names, self.worker_join_timeout))
        # check for workers that exited with a non-zero return code
        # note that workers killed in the previous step have code -15
        bad_exit = []
        for w in self.dead_workers:
            c = w.returncode()
            if c not in (0, -9, -15):  # -9: SIGKILL, -15: SIGTERM
                bad_exit.append(w)
        if bad_exit:
            raise ClusterError("The following workers terminated with "
                               "a bad exit code: {}".format([
                                   "(name: {}, pid: {}, code: {})".format(
                                       w.name, w.pid, w.returncode())
                                   for w in bad_exit
                               ]))
    finally:
        self.__finally__()
        if self.errors:
            logging.error("Errors were encountered when running"
                          " the cluster")
            for e in self.errors:
                logging.exception(e)
        self._exited = True
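
Since `__exit__` is half of the context-manager protocol, a `Cluster` is evidently meant to be driven by a `with` statement. A minimal usage sketch, assuming a constructor along these lines (the arguments and the body are hypothetical, not taken from the example):

# Hypothetical usage: `with` calls __enter__ on entry and the __exit__
# shown above on exit, even when the body raises.
with Cluster(command='my_app', workers=3) as cluster:  # assumed signature
    pass  # run the test scenario here
# On exit: background threads are stopped, workers are stopped and joined,
# and any non-zero worker exit code is raised as a ClusterError.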
Example #2
def run(self):
    # Poll the cluster for crashed workers until stopped externally.
    while not self.stopped():
        if self.func:
            crashed = self.cluster.get_crashed_workers(self.func)
        else:
            crashed = self.cluster.get_crashed_workers()
        if crashed:
            logging.debug("CrashChecker, results: {}".format(crashed))
            err = ClusterError(
                "A crash was detected in the workers: {}".format(",".join(
                    "{} ({}): {}".format(w.name, w.pid, w.returncode())
                    for w in crashed)))
            self.cluster.raise_from_error(err)
            self.stop()
            break
        time.sleep(0.5)
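
This `run` relies on the stoppable-thread pattern: `stopped()` and `stop()` must come from the `CrashChecker` class itself. A minimal sketch of that scaffolding, assuming an `Event`-based implementation (the constructor shown is a guess; only `run` above is from the original):

import threading

class CrashChecker(threading.Thread):
    # Assumed scaffolding around the run() method shown above.
    def __init__(self, cluster, func=None):
        super(CrashChecker, self).__init__()
        self.cluster = cluster
        self.func = func
        self._stop_event = threading.Event()

    def stop(self):
        self._stop_event.set()

    def stopped(self):
        return self._stop_event.is_set()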
Example #3
def add_runner(worker_id,
               runners,
               command,
               source_addrs,
               sink_addrs,
               metrics_addr,
               control_addr,
               res_dir,
               workers,
               my_control_addr,
               my_data_addr,
               my_external_addr,
               alt_block=None,
               alt_func=lambda x: False,
               spikes={}):
    cmd_stub = BASE_COMMAND.format(
        command=command,
        in_block=(IN_BLOCK.format(
            inputs=','.join(source_addrs)) if source_addrs else ''),
        out_block=(OUT_BLOCK.format(
            outputs=','.join(sink_addrs)) if sink_addrs else ''),
        metrics_addr=metrics_addr,
        res_dir=res_dir)

    # Test that the new worker *can* join
    if len(runners) < 1:
        raise ClusterError("There must be at least 1 worker to join!")

    if not any(r.is_alive() for r in runners):
        raise ClusterError("There must be at least 1 live worker to " "join!")

    if worker_id in spikes:
        logging.info("Enabling spike for joining worker{}".format(x))
        sc = spikes[worker_id]
        spike_block = SPIKE_CMD.format(
            prob=SPIKE_PROB.format(sc.probability),
            margin=SPIKE_MARGIN.format(sc.margin),
            seed=SPIKE_SEED.format(sc.seed) if sc.seed else '')
    else:
        spike_block = ''

    cmd = cmd_stub.format(name='worker{}'.format(worker_id),
                          initializer_block='',
                          worker_block=WORKER_CMD.format(
                              control_addr=my_control_addr,
                              data_addr=my_data_addr,
                              external_addr=my_external_addr),
                          join_block=JOIN_CMD.format(
                              join_addr=control_addr,
                              worker_count=(WORKER_COUNT_CMD.format(
                                  worker_count=workers) if workers else '')),
                          alt_block=alt_block if alt_func(worker_id) else '',
                          spike_block=spike_block)
    runner = Runner(command=cmd,
                    name='worker{}'.format(worker_id),
                    control=my_control_addr,
                    data=my_data_addr,
                    external=my_external_addr)
    runners.append(runner)

    # start the new worker
    runner.start()
    time.sleep(0.05)

    # check the runner hasn't exited with any errors
    try:
        assert runner.is_alive()
    except Exception:
        raise CrashedWorkerError()
    assert runner.error is None
    return runner
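
A sketch of how `add_runner` might be called to join one more worker to a running cluster. Every address, the command, and the directory below are placeholders, not values from the example:

# Hypothetical call: join 'worker3' to an existing 3-worker cluster.
new_runner = add_runner(worker_id=3,
                        runners=runners,  # live list from start_runners
                        command='my_app',
                        source_addrs=['127.0.0.1:7000'],
                        sink_addrs=['127.0.0.1:7002'],
                        metrics_addr='127.0.0.1:5001',
                        control_addr='127.0.0.1:12500',  # a live worker
                        res_dir='/tmp/res-data',
                        workers=4,  # assumed to be the post-join total
                        my_control_addr='127.0.0.1:12503',
                        my_data_addr='127.0.0.1:12603',
                        my_external_addr='127.0.0.1:5053')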
Example #4
def start_runners(runners,
                  command,
                  source_addrs,
                  sink_addrs,
                  metrics_addr,
                  res_dir,
                  workers,
                  worker_addrs=[],
                  alt_block=None,
                  alt_func=lambda x: False,
                  spikes={}):
    cmd_stub = BASE_COMMAND.format(
        command=command,
        in_block=(IN_BLOCK.format(
            inputs=','.join(source_addrs)) if source_addrs else ''),
        out_block=(OUT_BLOCK.format(
            outputs=','.join(sink_addrs)) if sink_addrs else ''),
        metrics_addr=metrics_addr,
        res_dir=res_dir)

    # for each worker, assign `name` and `cluster-initializer` values
    if workers < 1:
        raise ClusterError("workers must be 1 or more")
    x = 0
    if x in spikes:
        logging.info("Enabling spike for initializer")
        sc = spikes[x]
        spike_block = SPIKE_CMD.format(
            prob=SPIKE_PROB.format(prob=sc.probability),
            margin=SPIKE_MARGIN.format(margin=sc.margin),
            seed=SPIKE_SEED.format(seed=sc.seed) if sc.seed else '')
    else:
        spike_block = ''
    cmd = cmd_stub.format(
        name='initializer',
        initializer_block=INITIALIZER_CMD.format(
            worker_count=WORKER_COUNT_CMD.format(worker_count=workers),
            data_addr=worker_addrs[0][1],
            external_addr=worker_addrs[0][2]),
        worker_block='',
        join_block=CONTROL_CMD.format(control_addr=worker_addrs[0][0]),
        alt_block=alt_block if alt_func(x) else '',
        spike_block=spike_block)
    runners.append(
        Runner(command=cmd,
               name='initializer',
               control=worker_addrs[0][0],
               data=worker_addrs[0][1],
               external=worker_addrs[0][2]))
    for x in range(1, workers):
        if x in spikes:
            logging.info("Enabling spike for worker{}".format(x))
            sc = spikes[x]
            spike_block = SPIKE_CMD.format(
                prob=SPIKE_PROB.format(prob=sc.probability),
                margin=SPIKE_MARGIN.format(margin=sc.margin),
                seed=SPIKE_SEED.format(seed=sc.seed) if sc.seed else '')
        else:
            spike_block = ''
        cmd = cmd_stub.format(
            name='worker{}'.format(x),
            initializer_block='',
            worker_block=WORKER_CMD.format(control_addr=worker_addrs[x][0],
                                           data_addr=worker_addrs[x][1],
                                           external_addr=worker_addrs[x][2]),
            join_block=CONTROL_CMD.format(control_addr=worker_addrs[0][0]),
            alt_block=alt_block if alt_func(x) else '',
            spike_block=spike_block)
        runners.append(
            Runner(command=cmd,
                   name='worker{}'.format(x),
                   control=worker_addrs[x][0],
                   data=worker_addrs[x][1],
                   external=worker_addrs[x][2]))

    # start the workers, 50ms apart
    for r in runners:
        r.start()
        time.sleep(0.05)

    # check the runners haven't exited with any errors
    for idx, r in enumerate(runners):
        try:
            assert r.is_alive()
        except RunnerHasntStartedError:
            raise
        except Exception:
            stdout = r.get_output()
            raise ClusterError("Runner %d of %d has exited with an error: "
                               "\n---\n%s" % (idx + 1, len(runners), stdout))
        if r.error is not None:
            raise ClusterError("Runner %d of %d has exited with an error: "
                               "\n---\n%s" % (idx + 1, len(runners), r.error))