Example #1
    def __init__(self,
                 pipe_id,
                 runs,
                 working_dir,
                 total_nodes,
                 machine_name,
                 kill_on_partial_failure=False,
                 post_process_script=None,
                 post_process_args=None,
                 post_process_stop_on_failure=False,
                 node_layout=None,
                 launch_mode=None):
        self.id = pipe_id
        self.runs = runs
        self.working_dir = working_dir
        self.kill_on_partial_failure = kill_on_partial_failure
        self.post_process_script = post_process_script
        self.post_process_args = post_process_args
        self.post_process_stop_on_failure = post_process_stop_on_failure
        self.node_layout = node_layout
        self.machine_name = machine_name

        self._state_lock = threading.Lock()
        self._running = False
        self._force_killed = False
        self._active_runs = set()

        self._pipe_thread = None
        self._post_thread = None
        self.done_callbacks = set()
        self.fatal_callbacks = set()
        self.total_procs = 0
        self.log_prefix = self.id
        self._start_time = None
        self._walltime_path = self.working_dir + "/codar.savanna.total.walltime"

        for run in runs:
            self.total_procs += run.nprocs
            run.log_prefix = "%s:%s" % (self.id, run.name)
            run.machine = machines.get_by_name(machine_name)
        # Requires the machine's ppn to determine this, in case a node
        # layout is not specified
        self.total_nodes = total_nodes
        self.launch_mode = launch_mode

        # List of node IDs assigned to this pipeline. Initialized in
        # start().
        self.nodes_assigned = Queue()

        # Keep a copy of the nodes assigned. Pop nodes from this queue to
        # assign to Runs. When a Run is done, it can push the node back into
        # this queue, but Runs that share nodes may add the same node to the
        # queue multiple times. We need a SetQueue for this, or a way to
        # have all Runs on a shared node release nodes just once (see the
        # SetQueue sketch after this example).
        self._nodes_assigned = Queue()

        # Reorder the runs list so that runs are listed according to their
        # dependencies
        self.reorder_runs_by_dependencies()
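
The comment about releasing shared nodes refers to a SetQueue that the original code does not define here. A minimal sketch of such a queue, assuming nodes are hashable (e.g. node-name strings), could look like the following; the class name and the use of queue.Queue's _init/_put/_get hooks are choices made for this illustration, not taken from Savanna:

import queue

class SetQueue(queue.Queue):
    """Queue that ignores items already present, so Runs sharing a
    node can each release it without queueing it twice."""

    def _init(self, maxsize):
        self.queue = []        # FIFO order of available nodes
        self._members = set()  # fast membership test

    def _qsize(self):
        return len(self.queue)

    def _put(self, item):
        if item not in self._members:
            self._members.add(item)
            self.queue.append(item)

    def _get(self):
        item = self.queue.pop(0)
        self._members.discard(item)
        return item

Dropping duplicates in _put is the "SetQueue" route the comment mentions; the alternative it describes is coordinating the Runs on a shared node so that only one of them returns it.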
Example #2
    def _get_machine(self, machine_name):
        """Return the machine object for machine_name, or raise a
        CheetahException if the experiment does not support it."""
        machine = None
        for m in self.supported_machines:
            if m == machine_name:
                machine = machines.get_by_name(m)
        if machine is None:
            raise exc.CheetahException(
                "machine '%s' not supported by experiment '%s'" %
                (machine_name, self.name))
        return machine
Example #3
    def start(self, consumer, nodes_assigned, runner=None):
        # Mark all runs as active before they are actually started
        # in a separate thread, so other methods know the state.

        machine = machines.get_by_name(self.machine_name)
        for node_name in nodes_assigned:
            self.nodes_assigned.put(node_name)
            # self.nodes_assigned.put(machine.node_class(node_name))

        # Make a copy of nodes_assigned. copy.deepcopy does not work on a
        # Queue, so copy the items one at a time.
        # self._nodes_assigned = copy.deepcopy(self.nodes_assigned)
        for node in list(self.nodes_assigned.queue):
            self._nodes_assigned.put(node)

        self.add_done_callback(consumer.pipeline_finished)
        self.add_fatal_callback(consumer.pipeline_fatal)

        with self._state_lock:
            # Set the runner on the individual runs before they may be
            # merged into a single MPMD run below.
            for run in self.runs:
                run.set_runner(runner)
            if self.launch_mode and self.launch_mode.lower() == 'mpmd':
                mpmd_run = Run.mpmd_run(self.runs)
                mpmd_run.nodes = self.total_nodes
                self.runs = [mpmd_run]
            for run in self.runs:
                run.set_runner(runner)

                # The callback to consumer.run_finished, which releases the
                # nodes held by a run, is commented out; nodes are now
                # released when the whole pipeline finishes.

                # run.add_callback(consumer.run_finished)

                run.add_callback(self.run_finished)
                self._active_runs.add(run)
            self._running = True

            # Parse the node layout and set the run information.
            # This requires self.nodes_assigned.
            # Currently only supported on Summit.
            self._parse_node_layouts()

            # Finally, start the pipeline runs in a separate thread and
            # return immediately, so that a wait time can be injected
            # between starting runs (see the sketch after this example).
            self._pipe_thread = threading.Thread(target=self._start)
            self._pipe_thread.start()
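
start() hands the actual launching off to self._start on a worker thread; that method is not shown in this example. A hypothetical sketch of such a staggered start, assuming each Run exposes a start() method and an optional per-run delay attribute (both assumptions made for illustration, not confirmed by the snippet), might look like this:

import time

def _start(self):
    # Launch each run in order, pausing between starts so dependent
    # applications (e.g. a writer before a reader) have time to come up.
    # The 'sleep_after' attribute name is an assumption, not from Savanna.
    for run in self.runs:
        run.start()
        delay = getattr(run, 'sleep_after', None)
        if delay:
            time.sleep(delay)

Running this loop on its own thread is what lets start() return immediately while the delays are applied in the background.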