def test_failure_counters_with_valid_area(self):
    # Failure counters are returned for valid computation areas.
    stats.delete_job_counters(123)
    letters = itertools.cycle(string.ascii_lowercase)
    for idx, area in enumerate(["g", "h", "r"]):
        # Even-indexed areas ("g", "r") get two failure counters,
        # odd-indexed ones ("h") get a single counter.
        for _ in range(2 if idx % 2 == 0 else 1):
            stats.incr_counter(123, area, "%s:failed" % letters.next())
    expected = {
        "g": [('oqs/123/g/a:failed/i', 1), ('oqs/123/g/b:failed/i', 1)],
        "h": [('oqs/123/h/c:failed/i', 1)],
        "r": [('oqs/123/r/d:failed/i', 1), ('oqs/123/r/e:failed/i', 1)],
    }
    # Each area only reports its own failure counters.
    for area, counters in sorted(expected.items()):
        self.assertEqual(counters,
                         sorted(stats.failure_counters(123, area)))
def test_failure_counters_with_valid_area(self):
    # Failure counters are returned for valid computation areas.
    stats.delete_job_counters(123)
    letters = itertools.cycle(string.ascii_lowercase)
    for idx, area in enumerate(["g", "h", "r"]):
        # Even-indexed areas ("g", "r") get two failure counters,
        # odd-indexed ones ("h") get a single counter.
        for _ in range(2 if idx % 2 == 0 else 1):
            stats.incr_counter(123, area, "%s-failures" % letters.next())
    expected = {
        "g": [('oqs/123/g/a-failures/i', 1),
              ('oqs/123/g/b-failures/i', 1)],
        "h": [('oqs/123/h/c-failures/i', 1)],
        "r": [('oqs/123/r/d-failures/i', 1),
              ('oqs/123/r/e-failures/i', 1)],
    }
    # Each area only reports its own failure counters.
    for area, counters in sorted(expected.items()):
        self.assertEqual(counters,
                         sorted(stats.failure_counters(123, area)))
def timeout_callback(self):
    """
    On timeout expiration check if the job process is still running
    and whether it experienced any failures.

    Terminate the job process in the latter case.
    """
    def failure_counters_need_check():
        """Return `True` if failure counters should be checked.

        Returns `True` only every `FCC_DELAY`-th invocation; the
        invocation counter is reset each time the threshold is hit, so
        the (potentially expensive) counter query is rate-limited.
        """
        self.fcc_delay_value += 1
        result = self.fcc_delay_value >= self.FCC_DELAY
        if result:
            self.fcc_delay_value = 0
        return result

    process_stopped = job_failed = False
    message = None

    if not supervising.is_pid_running(self.job_pid):
        message = ('job process %s crashed or terminated' % self.job_pid)
        process_stopped = True
    elif failure_counters_need_check():
        # Job process is still running.
        failures = stats.failure_counters(self.job_id)
        if failures:
            message = "job terminated with failures: %s" % failures
        else:
            # No task-level failures seen; check for dead compute nodes.
            failed_nodes = abort_due_to_failed_nodes(self.job_id)
            if failed_nodes:
                message = ("job terminated due to %s failed nodes" %
                           failed_nodes)
        # NOTE: when `failures` is truthy, `failed_nodes` was never
        # assigned -- `or` short-circuits, so no NameError occurs.
        if failures or failed_nodes:
            terminate_job(self.job_pid)
            job_failed = True

    if job_failed or process_stopped:
        job_status = get_job_status(self.job_id)
        # NOTE(review): this revision compares against 'complete' and
        # calls update_job_status_and_error_msg() without an explicit
        # status; a sibling revision uses 'succeeded'/'running' and
        # passes 'failed' -- confirm which matches the DB schema.
        if process_stopped and job_status == 'complete':
            message = 'job process %s succeeded' % self.job_pid
            self.selflogger.debug(message)
        elif not job_status == 'complete':
            # The job crashed without having a chance to update the
            # status in the database, or it has been running even though
            # there were failures. We update the job status here.
            self.selflogger.error(message)
            update_job_status_and_error_msg(self.job_id, error_msg=message)
        record_job_stop_time(self.job_id)
        cleanup_after_job(self.job_id)
        # Stop the supervisor's periodic timeout loop.
        raise StopIteration()
def timeout_callback(self):
    """
    On timeout expiration check if the job process is still running
    and whether it experienced any failures.

    Terminate the job process in the latter case.
    """
    def failure_counters_need_check():
        """Return `True` if failure counters should be checked.

        Returns `True` only every `FCC_DELAY`-th invocation; the
        invocation counter is reset each time the threshold is hit, so
        the (potentially expensive) counter query is rate-limited.
        """
        self.fcc_delay_value += 1
        result = self.fcc_delay_value >= self.FCC_DELAY
        if result:
            self.fcc_delay_value = 0
        return result

    process_stopped = job_failed = False
    message = None

    if not supervising.is_pid_running(self.job_pid):
        message = ('job process %s crashed or terminated' % self.job_pid)
        process_stopped = True
    elif failure_counters_need_check():
        # Job process is still running.
        failures = stats.failure_counters(self.job_id)
        if failures:
            message = "job terminated with failures: %s" % failures
        else:
            # No task-level failures seen; check for dead compute nodes.
            failed_nodes = abort_due_to_failed_nodes(self.job_id)
            if failed_nodes:
                message = ("job terminated due to %s failed nodes" %
                           failed_nodes)
        # NOTE: when `failures` is truthy, `failed_nodes` was never
        # assigned -- `or` short-circuits, so no NameError occurs.
        if failures or failed_nodes:
            terminate_job(self.job_pid)
            job_failed = True

    if job_failed or process_stopped:
        job_status = get_job_status(self.job_id)
        if process_stopped and job_status == 'succeeded':
            # Normal termination: the job finished and recorded success
            # before the process went away.
            message = 'job process %s succeeded' % self.job_pid
            self.selflogger.info(message)
        elif job_status == 'running':
            # The job crashed without having a chance to update the
            # status in the database, or it has been running even though
            # there were failures. We update the job status here.
            self.selflogger.error(message)
            update_job_status_and_error_msg(self.job_id, 'failed', message)
        record_job_stop_time(self.job_id)
        cleanup_after_job(self.job_id)
        # Stop the supervisor's periodic timeout loop.
        raise StopIteration()
def test_failure_counters_with_no_area(self):
    # Failure counters are returned for all computation areas if the
    # 'area' parameter is omitted.
    stats.delete_job_counters(123)
    letters = itertools.cycle(string.ascii_lowercase)
    for idx, area in enumerate(["g", "h", "r"]):
        # Even-indexed areas ("g", "r") get two failure counters,
        # odd-indexed ones ("h") get a single counter.
        for _ in range(2 if idx % 2 == 0 else 1):
            stats.incr_counter(123, area, "%s:failed" % letters.next())
    expected = [
        ('oqs/123/g/a:failed/i', 1),
        ('oqs/123/g/b:failed/i', 1),
        ('oqs/123/h/c:failed/i', 1),
        ('oqs/123/r/d:failed/i', 1),
        ('oqs/123/r/e:failed/i', 1),
    ]
    self.assertEqual(expected, sorted(stats.failure_counters(123)))
def test_failure_counters_with_no_area(self):
    # Failure counters are returned for all computation areas if the
    # 'area' parameter is omitted.
    stats.delete_job_counters(123)
    letters = itertools.cycle(string.ascii_lowercase)
    for idx, area in enumerate(["g", "h", "r"]):
        # Even-indexed areas ("g", "r") get two failure counters,
        # odd-indexed ones ("h") get a single counter.
        for _ in range(2 if idx % 2 == 0 else 1):
            stats.incr_counter(123, area, "%s-failures" % letters.next())
    expected = [
        ('oqs/123/g/a-failures/i', 1),
        ('oqs/123/g/b-failures/i', 1),
        ('oqs/123/h/c-failures/i', 1),
        ('oqs/123/r/d-failures/i', 1),
        ('oqs/123/r/e-failures/i', 1),
    ]
    self.assertEqual(expected, sorted(stats.failure_counters(123)))
def ath(self, sites, rtype, datum=None):
    """
    Write calculation results to the database.

    Polls the KVS until hazard curve results are available for all of
    the given sites, serializing them to the database in batches as
    they appear.

    :param sites: the sites for which to write calculation results.
    :type sites: list of :py:class:`openquake.shapes.Site`
    :param str rtype: hazard curve type, one of: curve, mean, quantile
    :param datum: one of: realization, None, quantile
    :returns: the NRML path obtained from :py:func:`psha_exp.hcs_meta`
    :raises RuntimeError: if any hazard ("h") failure counters are set
        for the job while polling.
    """
    def pause_generator(value):
        """
        Returns the initial value when called for the first time and
        the double value upon each subsequent invocation.

        N.B.: the maximum value returned will never exceed 90 (seconds).
        """
        yield value
        while True:
            if value < 45:
                value *= 2
            yield value

    sites = set(sites)
    accounted_for = set()
    min_pause = 0.1
    pgen = pause_generator(min_pause)
    pause = pgen.next()
    key_template, nrml_path, hc_meta = psha_exp.hcs_meta(
        self.job_ctxt, rtype, datum)
    curve_writer = hazard_output.HazardCurveDBWriter(
        nrml_path, self.job_ctxt.job_id)
    # Keep polling until every site has produced a result.
    while accounted_for != sites:
        # Abort as soon as any hazard failure counter is observed.
        failures = stats.failure_counters(self.job_ctxt.job_id, "h")
        if failures:
            raise RuntimeError("hazard failures (%s), aborting" % failures)
        hc_data = []
        # Sleep a little before checking the availability of additional
        # hazard curve results.
        time.sleep(pause)
        results_found = 0
        for site in sites:
            if site in accounted_for:
                continue
            # NOTE(review): hash(site) is used in the KVS key -- this
            # assumes a stable, well-defined Site.__hash__; confirm.
            value = kvs.get_value_json_decoded(key_template % hash(site))
            if value is None:
                # No value yet, proceed to next site.
                continue
            # Use hazard curve ordinate values (PoE) from KVS and abscissae
            # from the IML list in config.
            hc_attrib = {
                'investigationTimeSpan':
                    self.job_ctxt['INVESTIGATION_TIME'],
                'IMLValues': self.job_ctxt.imls,
                'IMT': self.job_ctxt['INTENSITY_MEASURE_TYPE'],
                'PoEValues': value}
            hc_attrib.update(hc_meta)
            hc_data.append((site, hc_attrib))
            accounted_for.add(site)
            results_found += 1
        if not results_found:
            # No results found, increase the sleep pause.
            pause = pgen.next()
        else:
            curve_writer.serialize(hc_data)
            # Results are flowing: shrink the pause gradually, but
            # never below `min_pause`.
            pause *= 0.8
            pause = min_pause if pause < min_pause else pause
            logs.log_percent_complete(self.job_ctxt.job_id, "hazard")
    return nrml_path
def serialize_hazard_curve(self, nrml_file, key_template,
                           hc_attrib_update, sites):
    """
    Serialize the hazard curves of a set of sites.

    Depending on the parameters the serialized curve will be a plain,
    mean or quantile hazard curve. Polls the KVS until every site has
    a curve available, serializing batches as they appear.

    :param nrml_file: the output filename
    :type nrml_file: :py:class:`string`
    :param key_template: a template for constructing the key to get, for
                         each site, its curve from the KVS
    :type key_template: :py:class:`string`
    :param hc_attrib_update: a dictionary containing metadata for the set
                             of curves that will be serialized
    :type hc_attrib_update: :py:class:`dict`
    :param sites: the sites of which the curve will be serialized
    :type sites: list of :py:class:`openquake.shapes.Site`
    :returns: the full NRML path built from `nrml_file`
    :raises RuntimeError: if any hazard ("h") failure counters are set
        for the job while polling.
    """
    def pause_generator(value):
        """
        Returns the initial value when called for the first time and
        the double value upon each subsequent invocation.

        N.B.: the maximum value returned will never exceed 90 (seconds).
        """
        yield value
        while True:
            if value < 45:
                value *= 2
            yield value

    # XML serialization context: tracks overall progress (total sites,
    # done so far, next batch size) plus the block counters from stats.
    xsc = namedtuple("XSC", "blocks, cblock, i_total, i_done, i_next")(
        stats.pk_get(self.job_ctxt.job_id, "blocks"),
        stats.pk_get(self.job_ctxt.job_id, "cblock"),
        len(sites), 0, 0)

    nrml_path = self.job_ctxt.build_nrml_path(nrml_file)
    curve_writer = hazard_output.create_hazardcurve_writer(
        self.job_ctxt.job_id, self.job_ctxt.serialize_results_to,
        nrml_path)

    sites = set(sites)
    accounted_for = set()
    min_pause = 0.1
    pgen = pause_generator(min_pause)
    pause = pgen.next()
    # Keep polling until every site has produced a result.
    while accounted_for != sites:
        # Abort as soon as any hazard failure counter is observed.
        failures = stats.failure_counters(self.job_ctxt.job_id, "h")
        if failures:
            raise RuntimeError("hazard failures (%s), aborting" % failures)
        hc_data = []
        # Sleep a little before checking the availability of additional
        # hazard curve results.
        time.sleep(pause)
        results_found = 0
        for site in sites:
            if site in accounted_for:
                continue
            # NOTE(review): hash(site) is used in the KVS key -- this
            # assumes a stable, well-defined Site.__hash__; confirm.
            value = kvs.get_value_json_decoded(key_template % hash(site))
            if value is None:
                # No value yet, proceed to next site.
                continue
            # Use hazard curve ordinate values (PoE) from KVS and abscissae
            # from the IML list in config.
            hc_attrib = {
                'investigationTimeSpan':
                    self.job_ctxt['INVESTIGATION_TIME'],
                'IMLValues': self.job_ctxt.imls,
                'IMT': self.job_ctxt['INTENSITY_MEASURE_TYPE'],
                'PoEValues': value}
            hc_attrib.update(hc_attrib_update)
            hc_data.append((site, hc_attrib))
            accounted_for.add(site)
            results_found += 1
        if not results_found:
            # No results found, increase the sleep pause.
            pause = pgen.next()
        else:
            # Publish the batch size to the serializer context before
            # writing, then record the completed count afterwards.
            hazard_output.SerializerContext().update(
                xsc._replace(i_next=len(hc_data)))
            curve_writer.serialize(hc_data)
            xsc = xsc._replace(i_done=xsc.i_done + len(hc_data))
            # Results are flowing: shrink the pause gradually, but
            # never below `min_pause`.
            pause *= 0.8
            pause = min_pause if pause < min_pause else pause
    return nrml_path
def test_failure_counters_with_no_failures(self):
    # An empty list is returned in the absence of any failure counters.
    stats.delete_job_counters(123)
    result = stats.failure_counters(123)
    self.assertEqual([], result)
def ath(self, sites, rtype, datum=None):
    """
    Write calculation results to the database.

    Polls the KVS until hazard curve results are available for all of
    the given sites, serializing them to the database in batches as
    they appear.

    :param sites: the sites for which to write calculation results.
    :type sites: list of :py:class:`openquake.shapes.Site`
    :param str rtype: hazard curve type, one of: curve, mean, quantile
    :param datum: one of: realization, None, quantile
    :returns: the NRML path obtained from :py:func:`psha_exp.hcs_meta`
    :raises RuntimeError: if any hazard ("h") failure counters are set
        for the job while polling.
    """
    def pause_generator(value):
        """
        Returns the initial value when called for the first time and
        the double value upon each subsequent invocation.

        N.B.: the maximum value returned will never exceed 90 (seconds).
        """
        yield value
        while True:
            if value < 45:
                value *= 2
            yield value

    sites = set(sites)
    accounted_for = set()
    min_pause = 0.1
    pgen = pause_generator(min_pause)
    pause = pgen.next()
    key_template, nrml_path, hc_meta = psha_exp.hcs_meta(
        self.job_ctxt, rtype, datum)
    curve_writer = hazard_output.HazardCurveDBWriter(
        nrml_path, self.job_ctxt.job_id)
    # Keep polling until every site has produced a result.
    while accounted_for != sites:
        # Abort as soon as any hazard failure counter is observed.
        failures = stats.failure_counters(self.job_ctxt.job_id, "h")
        if failures:
            raise RuntimeError("hazard failures (%s), aborting" % failures)
        hc_data = []
        # Sleep a little before checking the availability of additional
        # hazard curve results.
        time.sleep(pause)
        results_found = 0
        for site in sites:
            if site in accounted_for:
                continue
            # NOTE(review): hash(site) is used in the KVS key -- this
            # assumes a stable, well-defined Site.__hash__; confirm.
            value = kvs.get_value_json_decoded(key_template % hash(site))
            if value is None:
                # No value yet, proceed to next site.
                continue
            # Use hazard curve ordinate values (PoE) from KVS and abscissae
            # from the IML list in config.
            hc_attrib = {
                'investigationTimeSpan':
                    self.job_ctxt['INVESTIGATION_TIME'],
                'IMLValues': self.job_ctxt.imls,
                'IMT': self.job_ctxt['INTENSITY_MEASURE_TYPE'],
                'PoEValues': value
            }
            hc_attrib.update(hc_meta)
            hc_data.append((site, hc_attrib))
            accounted_for.add(site)
            results_found += 1
        if not results_found:
            # No results found, increase the sleep pause.
            pause = pgen.next()
        else:
            curve_writer.serialize(hc_data)
            # Results are flowing: shrink the pause gradually, but
            # never below `min_pause`.
            pause *= 0.8
            pause = min_pause if pause < min_pause else pause
            logs.log_percent_complete(self.job_ctxt.job_id, "hazard")
    return nrml_path
def serialize_hazard_curve(self, nrml_file, key_template,
                           hc_attrib_update, sites):
    """
    Serialize the hazard curves of a set of sites.

    Depending on the parameters the serialized curve will be a plain,
    mean or quantile hazard curve. Polls the KVS until every site has
    a curve available, serializing batches as they appear.

    :param nrml_file: the output filename
    :type nrml_file: :py:class:`string`
    :param key_template: a template for constructing the key to get, for
                         each site, its curve from the KVS
    :type key_template: :py:class:`string`
    :param hc_attrib_update: a dictionary containing metadata for the set
                             of curves that will be serialized
    :type hc_attrib_update: :py:class:`dict`
    :param sites: the sites of which the curve will be serialized
    :type sites: list of :py:class:`openquake.shapes.Site`
    :returns: the full NRML path built from `nrml_file`
    :raises RuntimeError: if any hazard ("h") failure counters are set
        for the job while polling.
    """
    def pause_generator(value):
        """
        Returns the initial value when called for the first time and
        the double value upon each subsequent invocation.

        N.B.: the maximum value returned will never exceed 90 (seconds).
        """
        yield value
        while True:
            if value < 45:
                value *= 2
            yield value

    # XML serialization context: tracks overall progress (total sites,
    # done so far, next batch size) plus the block counters from stats.
    xsc = namedtuple("XSC", "blocks, cblock, i_total, i_done, i_next")(
        stats.pk_get(self.job_ctxt.job_id, "blocks"),
        stats.pk_get(self.job_ctxt.job_id, "cblock"),
        len(sites), 0, 0)

    nrml_path = self.job_ctxt.build_nrml_path(nrml_file)
    curve_writer = hazard_output.create_hazardcurve_writer(
        self.job_ctxt.job_id, self.job_ctxt.serialize_results_to,
        nrml_path)

    sites = set(sites)
    accounted_for = set()
    min_pause = 0.1
    pgen = pause_generator(min_pause)
    pause = pgen.next()
    # Keep polling until every site has produced a result.
    while accounted_for != sites:
        # Abort as soon as any hazard failure counter is observed.
        failures = stats.failure_counters(self.job_ctxt.job_id, "h")
        if failures:
            raise RuntimeError("hazard failures (%s), aborting" % failures)
        hc_data = []
        # Sleep a little before checking the availability of additional
        # hazard curve results.
        time.sleep(pause)
        results_found = 0
        for site in sites:
            if site in accounted_for:
                continue
            # NOTE(review): hash(site) is used in the KVS key -- this
            # assumes a stable, well-defined Site.__hash__; confirm.
            value = kvs.get_value_json_decoded(key_template % hash(site))
            if value is None:
                # No value yet, proceed to next site.
                continue
            # Use hazard curve ordinate values (PoE) from KVS and abscissae
            # from the IML list in config.
            hc_attrib = {
                'investigationTimeSpan':
                    self.job_ctxt['INVESTIGATION_TIME'],
                'IMLValues': self.job_ctxt.imls,
                'IMT': self.job_ctxt['INTENSITY_MEASURE_TYPE'],
                'PoEValues': value
            }
            hc_attrib.update(hc_attrib_update)
            hc_data.append((site, hc_attrib))
            accounted_for.add(site)
            results_found += 1
        if not results_found:
            # No results found, increase the sleep pause.
            pause = pgen.next()
        else:
            # Publish the batch size to the serializer context before
            # writing, then record the completed count afterwards.
            hazard_output.SerializerContext().update(
                xsc._replace(i_next=len(hc_data)))
            curve_writer.serialize(hc_data)
            xsc = xsc._replace(i_done=xsc.i_done + len(hc_data))
            # Results are flowing: shrink the pause gradually, but
            # never below `min_pause`.
            pause *= 0.8
            pause = min_pause if pause < min_pause else pause
    return nrml_path