Exemple #1
0
    def start_tasks(self, **kwargs):
        try:
            logger.info('Starting CachingDataController')

            # Add data controller to the queue first so that it
            # can get the initial data and is not blocked
            data_controller = CachingDataController(self.hydrodataset,
                                                    self.common_variables,
                                                    self.n_run,
                                                    self.get_data,
                                                    self.write_lock,
                                                    self.has_write_lock,
                                                    self.read_lock,
                                                    self.read_count,
                                                    self.time_chunk,
                                                    self.horiz_chunk,
                                                    self.times,
                                                    self.start,
                                                    self.point_get,
                                                    self.reference_location,
                                                    cache_path=self.cache_path)
            self.tasks.put(data_controller)
            # Create CachingDataController worker
            self.data_controller_process = Consumer(
                self.tasks,
                self.results,
                self.n_run,
                self.nproc_lock,
                self.active,
                self.get_data,
                name="CachingDataController")
            self.data_controller_process.start()

            logger.info('Adding %i particles as tasks' %
                        self.total_particle_count())

            for part in self.particles:
                forcer = CachingForcer(
                    self.cache_path,
                    particle=part,
                    common_variables=self.common_variables,
                    timevar=self.timevar,
                    times=self.times,
                    start_time=self.start,
                    models=self._models,
                    release_location_centroid=self.reference_location.point,
                    usebathy=self._use_bathymetry,
                    useshore=self._use_shoreline,
                    usesurface=self._use_seasurface,
                    reverse_distance=self.reverse_distance,
                    bathy_path=self.bathy_path,
                    shoreline_path=self.shoreline_path,
                    shoreline_feature=self.shoreline_feature,
                    time_method=self.time_method,
                    shoreline_index_buffer=self.shoreline_index_buffer,
                    get_data=self.get_data,
                    read_lock=self.read_lock,
                    has_read_lock=self.has_read_lock,
                    read_count=self.read_count,
                    point_get=self.point_get,
                    data_request_lock=self.data_request_lock,
                    has_data_request_lock=self.has_data_request_lock)
                self.tasks.put(forcer)

            # Create workers for the particles.
            self.procs = [
                Consumer(self.tasks,
                         self.results,
                         self.n_run,
                         self.nproc_lock,
                         self.active,
                         self.get_data,
                         name="CachingForcer-%d" % i)
                for i in range(self.nproc - 1)
            ]
            logger.progress((5, 'Running model'))
            for w in self.procs:
                w.start()
                logger.info('Started %s' % w.name)

            return True

        except Exception:
            logger.exception("Something didn't start correctly!")
            return False
Exemple #2
0
    def listen_for_results(self, output_h5_file, total_particles):
        try:
            # Get results back from queue, test for failed particles
            return_particles = []
            retrieved = 0.
            self.error_code = 0

            logger.info("Waiting for %i particle results" % total_particles)
            while retrieved < self.total_task_count(
            ):  # One for the CachingDataController

                logger.info("looping in listen_for_results")

                try:
                    # Returns a tuple of code, result
                    code, tempres = self.results.get(timeout=240)
                except queue.Empty:
                    # Poll the active processes to make sure they are all alive and then continue with loop
                    if not self.data_controller_process.is_alive(
                    ) and self.data_controller_process.exitcode != 0:
                        # Data controller is zombied, kill off other processes.
                        self.get_data.value is False
                        self.results.put((-2, "CachingDataController"))

                    new_procs = []
                    old_procs = []
                    for p in self.procs:
                        if not p.is_alive() and p.exitcode != 0:
                            # Do what the Consumer would do if something finished.
                            # Add something to results queue
                            self.results.put((-3, "ZombieParticle"))
                            # Decrement nproc (CachingDataController exits when this is 0)
                            with self.nproc_lock:
                                self.n_run.value = self.n_run.value - 1

                            # Remove task from queue (so they can be joined later on)
                            self.tasks.task_done()

                            # Start a new Consumer.  It will exit if there are no tasks available.
                            np = Consumer(self.tasks,
                                          self.results,
                                          self.n_run,
                                          self.nproc_lock,
                                          self.active,
                                          self.get_data,
                                          name=p.name)
                            new_procs.append(np)
                            old_procs.append(p)

                            # Release any locks the PID had
                            if p.pid in self.has_read_lock:
                                with self.read_lock:
                                    self.read_count.value -= 1
                                    self.has_read_lock.remove(p.pid)

                            if self.has_data_request_lock.value == p.pid:
                                self.has_data_request_lock.value = -1
                                try:
                                    self.data_request_lock.release()
                                except:
                                    pass

                            if self.has_write_lock.value == p.pid:
                                self.has_write_lock.value = -1
                                try:
                                    self.write_lock.release()
                                except:
                                    pass

                    for p in old_procs:
                        try:
                            self.procs.remove(p)
                        except ValueError:
                            logger.warn(
                                "Did not find %s in the list of processes.  Continuing on."
                                % p.name)

                    for p in new_procs:
                        self.procs.append(p)
                        logger.warn(
                            "Started a new consumer (%s) to replace a zombie consumer"
                            % p.name)
                        p.start()

                else:
                    # We got one.
                    retrieved += 1
                    if code is None:
                        logger.warn(
                            "Got an unrecognized response from a task.")
                    elif code == -1:
                        logger.warn("Particle %s has FAILED!!" % tempres.uid)
                    elif code == -2:
                        self.error_code = code
                        logger.warn(
                            "CachingDataController has FAILED!!  Removing cache file so the particles fail."
                        )
                        try:
                            os.remove(self.cache_path)
                        except OSError:
                            logger.debug(
                                "Could not remove cache file, it probably never existed"
                            )
                            pass
                    elif code == -3:
                        self.error_code = code
                        logger.info(
                            "A zombie process was caught and task was removed from queue"
                        )
                    elif isinstance(tempres, Particle):
                        logger.info("Particle %d finished" % tempres.uid)
                        return_particles.append(tempres)
                        # We mulitply by 95 here to save 5% for the exporting
                        logger.progress(
                            (round((retrieved / self.total_task_count()) * 90.,
                                   1), "Particle %d finished" % tempres.uid))
                    elif tempres == "CachingDataController":
                        logger.info("CachingDataController finished")
                        logger.progress(
                            (round((retrieved / self.total_task_count()) * 90.,
                                   1), "CachingDataController finished"))
                    else:
                        logger.info("Got a strange result on results queue")
                        logger.info(str(tempres))

                    logger.info("Retrieved %i/%i results" %
                                (int(retrieved), self.total_task_count()))

                # Relax
                time.sleep(1)

            if len(return_particles) != total_particles:
                logger.warn(
                    "Some particles failed and are not included in the output")

            # The results queue should be empty at this point
            assert self.results.empty() is True

            # Should be good to join on the tasks now that the queue is empty
            logger.info("Joining the task queue")
            self.tasks.join()
            self.tasks.close()
            self.tasks.join_thread()

        finally:
            # Join all processes
            logger.info("Joining the processes")
            for w in self.procs + [self.data_controller_process]:
                # Wait 20 seconds
                w.join(20.)
                if w.is_alive():
                    # Process is hanging, kill it.
                    logger.info(
                        "Terminating %s forcefully.  This should have exited itself."
                        % w.name)
                    w.terminate()

        if self.error_code == -2:
            raise ValueError(
                "Error in the BaseDataController (error_code was -2)")

        results = ex.ResultsPyTable(output_h5_file)
        for p in return_particles:
            for x in range(len(p.locations)):
                results.write(p.timestep_index_dump(x))
        results.compute()
        results.close()

        return