Example #1
0
    def test_failure_counters_with_valid_area(self):
        """Failure counters are returned for valid computation areas."""
        stats.delete_job_counters(123)
        # Endless supply of counter names: a, b, c, ...
        fcname = itertools.cycle(string.ascii_lowercase)
        for cidx, carea in enumerate(["g", "h", "r"]):
            # next(it) works on Python 2.6+ and 3.x; it.next() is 2.x-only.
            stats.incr_counter(123, carea, "%s:failed" % next(fcname))
            if not (cidx % 2):
                # Even-indexed areas ("g" and "r") get a second counter.
                stats.incr_counter(123, carea, "%s:failed" % next(fcname))

        self.assertEqual([('oqs/123/g/a:failed/i', 1),
                          ('oqs/123/g/b:failed/i', 1)],
                         sorted(stats.failure_counters(123, "g")))
        self.assertEqual([('oqs/123/h/c:failed/i', 1)],
                         sorted(stats.failure_counters(123, "h")))
        self.assertEqual([('oqs/123/r/d:failed/i', 1),
                          ('oqs/123/r/e:failed/i', 1)],
                         sorted(stats.failure_counters(123, "r")))
Example #2
0
    def test_failure_counters_with_valid_area(self):
        """Failure counters are returned for valid computation areas."""
        stats.delete_job_counters(123)
        # Endless supply of counter names: a, b, c, ...
        fcname = itertools.cycle(string.ascii_lowercase)
        for cidx, carea in enumerate(["g", "h", "r"]):
            # next(it) works on Python 2.6+ and 3.x; it.next() is 2.x-only.
            stats.incr_counter(123, carea, "%s-failures" % next(fcname))
            if not (cidx % 2):
                # Even-indexed areas ("g" and "r") get a second counter.
                stats.incr_counter(123, carea, "%s-failures" % next(fcname))

        self.assertEqual(
            [('oqs/123/g/a-failures/i', 1), ('oqs/123/g/b-failures/i', 1)],
            sorted(stats.failure_counters(123, "g")))
        self.assertEqual([('oqs/123/h/c-failures/i', 1)],
                         sorted(stats.failure_counters(123, "h")))
        self.assertEqual(
            [('oqs/123/r/d-failures/i', 1), ('oqs/123/r/e-failures/i', 1)],
            sorted(stats.failure_counters(123, "r")))
Example #3
0
    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running
        and whether it experienced any failures.

        Terminate the job process in the latter case.

        :raises StopIteration: once the job has terminated (successfully
            or not), to stop the supervisor's timeout loop.
        """
        def failure_counters_need_check():
            """Return `True` if failure counters should be checked."""
            # Rate-limit the (potentially expensive) failure counter
            # check to every FCC_DELAY-th invocation.
            self.fcc_delay_value += 1
            result = self.fcc_delay_value >= self.FCC_DELAY
            if result:
                self.fcc_delay_value = 0
            return result

        process_stopped = job_failed = False
        # Initialise both explicitly so the combined check below does not
        # depend on short-circuit evaluation to avoid a NameError
        # (`failed_nodes` is only assigned when `failures` is empty).
        failures = failed_nodes = None
        message = None

        if not supervising.is_pid_running(self.job_pid):
            message = ('job process %s crashed or terminated' % self.job_pid)
            process_stopped = True
        elif failure_counters_need_check():
            # Job process is still running.
            failures = stats.failure_counters(self.job_id)
            if failures:
                message = "job terminated with failures: %s" % failures
            else:
                failed_nodes = abort_due_to_failed_nodes(self.job_id)
                if failed_nodes:
                    message = ("job terminated due to %s failed nodes" %
                               failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if job_failed or process_stopped:
            job_status = get_job_status(self.job_id)
            if process_stopped and job_status == 'complete':
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.debug(message)
            elif job_status != 'complete':
                # The job crashed without having a chance to update the
                # status in the database, or it has been running even though
                # there were failures. We update the job status here.
                self.selflogger.error(message)
                update_job_status_and_error_msg(self.job_id, error_msg=message)

            record_job_stop_time(self.job_id)
            cleanup_after_job(self.job_id)
            raise StopIteration()
Example #4
0
    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running
        and whether it experienced any failures.

        Terminate the job process in the latter case.

        :raises StopIteration: once the job has terminated (successfully
            or not), to stop the supervisor's timeout loop.
        """
        def failure_counters_need_check():
            """Return `True` if failure counters should be checked."""
            # Rate-limit the (potentially expensive) failure counter
            # check to every FCC_DELAY-th invocation.
            self.fcc_delay_value += 1
            result = self.fcc_delay_value >= self.FCC_DELAY
            if result:
                self.fcc_delay_value = 0
            return result

        process_stopped = job_failed = False
        # Initialise both explicitly so the combined check below does not
        # depend on short-circuit evaluation to avoid a NameError
        # (`failed_nodes` is only assigned when `failures` is empty).
        failures = failed_nodes = None
        message = None

        if not supervising.is_pid_running(self.job_pid):
            message = ('job process %s crashed or terminated' % self.job_pid)
            process_stopped = True
        elif failure_counters_need_check():
            # Job process is still running.
            failures = stats.failure_counters(self.job_id)
            if failures:
                message = "job terminated with failures: %s" % failures
            else:
                failed_nodes = abort_due_to_failed_nodes(self.job_id)
                if failed_nodes:
                    message = ("job terminated due to %s failed nodes" %
                               failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if job_failed or process_stopped:
            job_status = get_job_status(self.job_id)
            if process_stopped and job_status == 'succeeded':
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.info(message)
            elif job_status == 'running':
                # The job crashed without having a chance to update the
                # status in the database, or it has been running even though
                # there were failures. We update the job status here.
                self.selflogger.error(message)
                update_job_status_and_error_msg(self.job_id, 'failed', message)

            record_job_stop_time(self.job_id)
            cleanup_after_job(self.job_id)
            raise StopIteration()
    def test_failure_counters_with_no_area(self):
        """Counters for all computation areas are returned when the
        'area' parameter is omitted."""
        stats.delete_job_counters(123)
        # Endless supply of counter names: a, b, c, ...
        fcname = itertools.cycle(string.ascii_lowercase)
        for cidx, carea in enumerate(["g", "h", "r"]):
            # next(it) works on Python 2.6+ and 3.x; it.next() is 2.x-only.
            stats.incr_counter(123, carea, "%s:failed" % next(fcname))
            if not (cidx % 2):
                # Even-indexed areas ("g" and "r") get a second counter.
                stats.incr_counter(123, carea, "%s:failed" % next(fcname))

        self.assertEqual(
            [('oqs/123/g/a:failed/i', 1), ('oqs/123/g/b:failed/i', 1),
             ('oqs/123/h/c:failed/i', 1), ('oqs/123/r/d:failed/i', 1),
             ('oqs/123/r/e:failed/i', 1)],
            sorted(stats.failure_counters(123)))
Example #6
0
    def test_failure_counters_with_no_area(self):
        """Counters for all computation areas are returned when the
        'area' parameter is omitted."""
        stats.delete_job_counters(123)
        # Endless supply of counter names: a, b, c, ...
        fcname = itertools.cycle(string.ascii_lowercase)
        for cidx, carea in enumerate(["g", "h", "r"]):
            # next(it) works on Python 2.6+ and 3.x; it.next() is 2.x-only.
            stats.incr_counter(123, carea, "%s-failures" % next(fcname))
            if not (cidx % 2):
                # Even-indexed areas ("g" and "r") get a second counter.
                stats.incr_counter(123, carea, "%s-failures" % next(fcname))

        self.assertEqual([('oqs/123/g/a-failures/i', 1),
                          ('oqs/123/g/b-failures/i', 1),
                          ('oqs/123/h/c-failures/i', 1),
                          ('oqs/123/r/d-failures/i', 1),
                          ('oqs/123/r/e-failures/i', 1)],
                         sorted(stats.failure_counters(123)))
Example #7
0
    def ath(self, sites, rtype, datum=None):
        """
        Write calculation results to the database.

        Polls the KVS until a hazard curve has been seen (and serialized)
        for every site, backing off exponentially while no new results
        arrive.

        :param sites: the sites for which to write calculation results.
        :type sites: list of :py:class:`openquake.shapes.Site`
        :param str rtype: hazard curve type, one of: curve, mean, quantile
        :param datum: one of: realization, None, quantile
        :returns: the NRML path associated with these calculation results.
        :raises RuntimeError: if hazard failure counters are detected.
        """

        def pause_generator(value):
            """
            Returns the initial value when called for the first time and
            the double value upon each subsequent invocation.

            N.B.: the maximum value returned will never exceed 90 (seconds).
            """
            yield value
            while True:
                if value < 45:
                    value *= 2
                yield value

        sites = set(sites)
        accounted_for = set()
        min_pause = 0.1
        pgen = pause_generator(min_pause)
        # next(gen) works on Python 2.6+ and 3.x; gen.next() is 2.x-only.
        pause = next(pgen)

        key_template, nrml_path, hc_meta = psha_exp.hcs_meta(
            self.job_ctxt, rtype, datum)

        curve_writer = hazard_output.HazardCurveDBWriter(
            nrml_path, self.job_ctxt.job_id)

        while accounted_for != sites:
            # Abort as soon as any hazard ("h" area) failures are seen.
            failures = stats.failure_counters(self.job_ctxt.job_id, "h")
            if failures:
                raise RuntimeError("hazard failures (%s), aborting" % failures)
            hc_data = []
            # Sleep a little before checking the availability of additional
            # hazard curve results.
            time.sleep(pause)
            results_found = 0
            for site in sites:
                if site in accounted_for:
                    continue
                value = kvs.get_value_json_decoded(key_template % hash(site))
                if value is None:
                    # No value yet, proceed to next site.
                    continue
                # Use hazard curve ordinate values (PoE) from KVS and abscissae
                # from the IML list in config.
                hc_attrib = {
                    'investigationTimeSpan':
                        self.job_ctxt['INVESTIGATION_TIME'],
                    'IMLValues': self.job_ctxt.imls,
                    'IMT': self.job_ctxt['INTENSITY_MEASURE_TYPE'],
                    'PoEValues': value}
                hc_attrib.update(hc_meta)
                hc_data.append((site, hc_attrib))
                accounted_for.add(site)
                results_found += 1
            if not results_found:
                # No results found, increase the sleep pause.
                pause = next(pgen)
            else:
                curve_writer.serialize(hc_data)
                # Results are flowing again: shrink the pause, but never
                # below the configured minimum.
                pause *= 0.8
                pause = min_pause if pause < min_pause else pause
            logs.log_percent_complete(self.job_ctxt.job_id, "hazard")

        return nrml_path
Example #8
0
    def serialize_hazard_curve(self, nrml_file, key_template, hc_attrib_update,
                               sites):
        """
        Serialize the hazard curves of a set of sites.

        Depending on the parameters the serialized curve will be a plain, mean
        or quantile hazard curve.

        :param nrml_file: the output filename
        :type nrml_file: :py:class:`string`
        :param key_template: a template for constructing the key to get, for
                             each site, its curve from the KVS
        :type key_template: :py:class:`string`
        :param hc_attrib_update: a dictionary containing metadata for the set
                                 of curves that will be serialized
        :type hc_attrib_update: :py:class:`dict`
        :param sites: the sites of which the curve will be serialized
        :type sites: list of :py:class:`openquake.shapes.Site`
        :returns: the NRML path the curves were serialized to.
        :raises RuntimeError: if hazard failure counters are detected.
        """

        def pause_generator(value):
            """
            Returns the initial value when called for the first time and
            the double value upon each subsequent invocation.

            N.B.: the maximum value returned will never exceed 90 (seconds).
            """
            yield value
            while True:
                if value < 45:
                    value *= 2
                yield value

        # XML serialization context: overall block counts plus progress
        # counters (i_done/i_next) that are updated as curves are written.
        xsc = namedtuple("XSC", "blocks, cblock, i_total, i_done, i_next")(
                         stats.pk_get(self.job_ctxt.job_id, "blocks"),
                         stats.pk_get(self.job_ctxt.job_id, "cblock"),
                         len(sites), 0, 0)

        nrml_path = self.job_ctxt.build_nrml_path(nrml_file)

        curve_writer = hazard_output.create_hazardcurve_writer(
            self.job_ctxt.job_id, self.job_ctxt.serialize_results_to,
            nrml_path)

        sites = set(sites)
        accounted_for = set()
        min_pause = 0.1
        pgen = pause_generator(min_pause)
        # next(gen) works on Python 2.6+ and 3.x; gen.next() is 2.x-only.
        pause = next(pgen)

        while accounted_for != sites:
            # Abort as soon as any hazard ("h" area) failures are seen.
            failures = stats.failure_counters(self.job_ctxt.job_id, "h")
            if failures:
                raise RuntimeError("hazard failures (%s), aborting" % failures)
            hc_data = []
            # Sleep a little before checking the availability of additional
            # hazard curve results.
            time.sleep(pause)
            results_found = 0
            for site in sites:
                if site in accounted_for:
                    continue
                value = kvs.get_value_json_decoded(key_template % hash(site))
                if value is None:
                    # No value yet, proceed to next site.
                    continue
                # Use hazard curve ordinate values (PoE) from KVS and abscissae
                # from the IML list in config.
                hc_attrib = {
                    'investigationTimeSpan':
                        self.job_ctxt['INVESTIGATION_TIME'],
                    'IMLValues': self.job_ctxt.imls,
                    'IMT': self.job_ctxt['INTENSITY_MEASURE_TYPE'],
                    'PoEValues': value}
                hc_attrib.update(hc_attrib_update)
                hc_data.append((site, hc_attrib))
                accounted_for.add(site)
                results_found += 1
            if not results_found:
                # No results found, increase the sleep pause.
                pause = next(pgen)
            else:
                hazard_output.SerializerContext().update(
                    xsc._replace(i_next=len(hc_data)))
                curve_writer.serialize(hc_data)
                xsc = xsc._replace(i_done=xsc.i_done + len(hc_data))
                # Results are flowing again: shrink the pause, but never
                # below the configured minimum.
                pause *= 0.8
                pause = min_pause if pause < min_pause else pause

        return nrml_path
Example #9
0
 def test_failure_counters_with_no_failures(self):
     """failure_counters() returns [] when no failures were recorded."""
     # An empty list is returned in the absence of any failure counters
     stats.delete_job_counters(123)
     self.assertEqual([], stats.failure_counters(123))
Example #10
0
    def ath(self, sites, rtype, datum=None):
        """
        Write calculation results to the database.

        Polls the KVS until a hazard curve has been seen (and serialized)
        for every site, backing off exponentially while no new results
        arrive.

        :param sites: the sites for which to write calculation results.
        :type sites: list of :py:class:`openquake.shapes.Site`
        :param str rtype: hazard curve type, one of: curve, mean, quantile
        :param datum: one of: realization, None, quantile
        :returns: the NRML path associated with these calculation results.
        :raises RuntimeError: if hazard failure counters are detected.
        """
        def pause_generator(value):
            """
            Returns the initial value when called for the first time and
            the double value upon each subsequent invocation.

            N.B.: the maximum value returned will never exceed 90 (seconds).
            """
            yield value
            while True:
                if value < 45:
                    value *= 2
                yield value

        sites = set(sites)
        accounted_for = set()
        min_pause = 0.1
        pgen = pause_generator(min_pause)
        # next(gen) works on Python 2.6+ and 3.x; gen.next() is 2.x-only.
        pause = next(pgen)

        key_template, nrml_path, hc_meta = psha_exp.hcs_meta(
            self.job_ctxt, rtype, datum)

        curve_writer = hazard_output.HazardCurveDBWriter(
            nrml_path, self.job_ctxt.job_id)

        while accounted_for != sites:
            # Abort as soon as any hazard ("h" area) failures are seen.
            failures = stats.failure_counters(self.job_ctxt.job_id, "h")
            if failures:
                raise RuntimeError("hazard failures (%s), aborting" % failures)
            hc_data = []
            # Sleep a little before checking the availability of additional
            # hazard curve results.
            time.sleep(pause)
            results_found = 0
            for site in sites:
                if site in accounted_for:
                    continue
                value = kvs.get_value_json_decoded(key_template % hash(site))
                if value is None:
                    # No value yet, proceed to next site.
                    continue
                # Use hazard curve ordinate values (PoE) from KVS and abscissae
                # from the IML list in config.
                hc_attrib = {
                    'investigationTimeSpan':
                    self.job_ctxt['INVESTIGATION_TIME'],
                    'IMLValues': self.job_ctxt.imls,
                    'IMT': self.job_ctxt['INTENSITY_MEASURE_TYPE'],
                    'PoEValues': value
                }
                hc_attrib.update(hc_meta)
                hc_data.append((site, hc_attrib))
                accounted_for.add(site)
                results_found += 1
            if not results_found:
                # No results found, increase the sleep pause.
                pause = next(pgen)
            else:
                curve_writer.serialize(hc_data)
                # Results are flowing again: shrink the pause, but never
                # below the configured minimum.
                pause *= 0.8
                pause = min_pause if pause < min_pause else pause
            logs.log_percent_complete(self.job_ctxt.job_id, "hazard")

        return nrml_path
Example #11
0
    def serialize_hazard_curve(self, nrml_file, key_template, hc_attrib_update,
                               sites):
        """
        Serialize the hazard curves of a set of sites.

        Depending on the parameters the serialized curve will be a plain, mean
        or quantile hazard curve.

        :param nrml_file: the output filename
        :type nrml_file: :py:class:`string`
        :param key_template: a template for constructing the key to get, for
                             each site, its curve from the KVS
        :type key_template: :py:class:`string`
        :param hc_attrib_update: a dictionary containing metadata for the set
                                 of curves that will be serialized
        :type hc_attrib_update: :py:class:`dict`
        :param sites: the sites of which the curve will be serialized
        :type sites: list of :py:class:`openquake.shapes.Site`
        :returns: the NRML path the curves were serialized to.
        :raises RuntimeError: if hazard failure counters are detected.
        """
        def pause_generator(value):
            """
            Returns the initial value when called for the first time and
            the double value upon each subsequent invocation.

            N.B.: the maximum value returned will never exceed 90 (seconds).
            """
            yield value
            while True:
                if value < 45:
                    value *= 2
                yield value

        # XML serialization context: overall block counts plus progress
        # counters (i_done/i_next) that are updated as curves are written.
        xsc = namedtuple("XSC", "blocks, cblock, i_total, i_done, i_next")(
            stats.pk_get(self.job_ctxt.job_id, "blocks"),
            stats.pk_get(self.job_ctxt.job_id, "cblock"), len(sites), 0, 0)

        nrml_path = self.job_ctxt.build_nrml_path(nrml_file)

        curve_writer = hazard_output.create_hazardcurve_writer(
            self.job_ctxt.job_id, self.job_ctxt.serialize_results_to,
            nrml_path)

        sites = set(sites)
        accounted_for = set()
        min_pause = 0.1
        pgen = pause_generator(min_pause)
        # next(gen) works on Python 2.6+ and 3.x; gen.next() is 2.x-only.
        pause = next(pgen)

        while accounted_for != sites:
            # Abort as soon as any hazard ("h" area) failures are seen.
            failures = stats.failure_counters(self.job_ctxt.job_id, "h")
            if failures:
                raise RuntimeError("hazard failures (%s), aborting" % failures)
            hc_data = []
            # Sleep a little before checking the availability of additional
            # hazard curve results.
            time.sleep(pause)
            results_found = 0
            for site in sites:
                if site in accounted_for:
                    continue
                value = kvs.get_value_json_decoded(key_template % hash(site))
                if value is None:
                    # No value yet, proceed to next site.
                    continue
                # Use hazard curve ordinate values (PoE) from KVS and abscissae
                # from the IML list in config.
                hc_attrib = {
                    'investigationTimeSpan':
                    self.job_ctxt['INVESTIGATION_TIME'],
                    'IMLValues': self.job_ctxt.imls,
                    'IMT': self.job_ctxt['INTENSITY_MEASURE_TYPE'],
                    'PoEValues': value
                }
                hc_attrib.update(hc_attrib_update)
                hc_data.append((site, hc_attrib))
                accounted_for.add(site)
                results_found += 1
            if not results_found:
                # No results found, increase the sleep pause.
                pause = next(pgen)
            else:
                hazard_output.SerializerContext().update(
                    xsc._replace(i_next=len(hc_data)))
                curve_writer.serialize(hc_data)
                xsc = xsc._replace(i_done=xsc.i_done + len(hc_data))
                # Results are flowing again: shrink the pause, but never
                # below the configured minimum.
                pause *= 0.8
                pause = min_pause if pause < min_pause else pause

        return nrml_path
Example #12
0
 def test_failure_counters_with_no_failures(self):
     """failure_counters() returns [] when no failures were recorded."""
     # An empty list is returned in the absence of any failure counters
     stats.delete_job_counters(123)
     self.assertEqual([], stats.failure_counters(123))