Example #1
 def connect(self, *args, **kwargs):
     host = config.get("kvs", "host")
     port = config.get("kvs", "port")
     port = int(port) if port else 6379
     stats_db = config.get("kvs", "stats_db")
     stats_db = int(stats_db) if stats_db else 15
     args = {"host": host, "port": port, "db": stats_db}
     return redis.Redis(**args)
Example #2
def _redis():
    """Return a connection to the redis store."""
    host = config.get("kvs", "host")
    port = config.get("kvs", "port")
    port = int(port) if port else 6379
    stats_db = config.get("kvs", "stats_db")
    stats_db = int(stats_db) if stats_db else DEFAULT_STATS_DB
    args = {"host": host, "port": port, "db": stats_db}
    return redis.Redis(**args)
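The snippets in Examples #1 and #2 (and several later ones) repeat the same read-cast-default pattern around config.get. A small helper like the one below condenses it; this is a hypothetical convenience sketch, not part of the openquake API, and it only assumes that config.get(section, key) returns a string or None, as the examples imply.

from openquake.engine.utils import config  # as used throughout these examples


def get_or_default(section, key, default, cast=int):
    """Read a config value, cast it, and fall back to a default when unset.

    Hypothetical helper, not part of openquake.engine.utils.config.
    """
    raw = config.get(section, key)
    return cast(raw) if raw else default


# e.g. port = get_or_default("kvs", "port", 6379)
#      stats_db = get_or_default("kvs", "stats_db", 15)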
Example #3
def _redis():
    """Return a connection to the redis store."""
    host = config.get("kvs", "host")
    port = config.get("kvs", "port")
    port = int(port) if port else 6379
    stats_db = config.get("kvs", "stats_db")
    stats_db = int(stats_db) if stats_db else DEFAULT_STATS_DB
    args = {"host": host, "port": port, "db": stats_db}
    return redis.Redis(**args)
Example #4
 def __init__(self, job, monitor=None):
     self.job = job
     self.oqparam = self.job.get_oqparam()
     self.monitor = monitor or EnginePerformanceMonitor('', job.id)
     self.num_tasks = None
     self._task_args = []
     # parameters from openquake.cfg
     self.concurrent_tasks = int(config.get('celery', 'concurrent_tasks'))
     self.max_input_weight = float(config.get('hazard', 'max_input_weight'))
     self.max_output_weight = float(
         config.get('hazard', 'max_output_weight'))
     TrtModel.POINT_SOURCE_WEIGHT = float(
         config.get('hazard', 'point_source_weight'))
Example #5
    def __init__(self, job):
        super(BaseHazardCalculator, self).__init__(job)

        # three crucial parameters from openquake.cfg
        self.source_max_weight = int(
            config.get('hazard', 'source_max_weight'))
        self.concurrent_tasks = int(
            config.get('hazard', 'concurrent_tasks'))

        # a dictionary trt_model_id -> num_ruptures
        self.num_ruptures = collections.defaultdict(int)
        # now a dictionary (trt_model_id, gsim) -> poes
        self.curves = {}
Example #6
 def __init__(self, job):
     self.job = job
     self.num_tasks = None
     self._task_args = []
     # parameters from openquake.cfg
     self.concurrent_tasks = int(
         config.get('celery', 'concurrent_tasks'))
     self.max_input_weight = float(
         config.get('hazard', 'max_input_weight'))
     self.max_output_weight = float(
         config.get('hazard', 'max_output_weight'))
     SourceCollector.POINT_SOURCE_WEIGHT = float(
         config.get('hazard', 'point_source_weight'))
Example #7
 def __init__(self, job, monitor=None):
     self.job = job
     self.oqparam = self.job.get_oqparam()
     self.monitor = monitor or EnginePerformanceMonitor('', job.id)
     self.num_tasks = None
     self._task_args = []
     # parameters from openquake.cfg
     self.concurrent_tasks = self.oqparam.concurrent_tasks
     self.max_input_weight = float(
         config.get('hazard', 'max_input_weight'))
     self.max_output_weight = float(
         config.get('hazard', 'max_output_weight'))
     TrtModel.POINT_SOURCE_WEIGHT = float(
         config.get('hazard', 'point_source_weight'))
Example #8
 def test_get_with_unknown_key(self):
     """config.get() returns `None` if the `key` is not known."""
     with patch('openquake.engine.utils.config.get_section') as mock:
         mock.return_value = dict(b=1)
         self.assertTrue(config.get("arghh", "c") is None)
         self.assertEqual(1, mock.call_count)
         self.assertEqual([("arghh", ), {}], mock.call_args)
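For reference, the behaviour pinned down by this test (and by the similar ones in Examples #11 and #13-#16 below) can be sketched as follows. This is an assumed minimal implementation of get_section/get, not necessarily the real openquake.engine.utils.config code; the openquake.cfg path is a placeholder and the code follows the snippets' Python 2 style.

import ConfigParser  # Python 2, consistent with the surrounding snippets

OQ_CONFIG_FILE = "openquake.cfg"  # placeholder path


def get_section(section):
    """Return the given section as a dict, or an empty dict if missing."""
    cp = ConfigParser.ConfigParser()
    cp.read([OQ_CONFIG_FILE])
    if not cp.has_section(section):
        return {}
    return dict(cp.items(section))


def get(section, key):
    """Return the raw string value for (section, key), or None if unknown."""
    data = get_section(section)  # called exactly once, as the tests assert
    if not data:
        return None
    return data.get(key)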
Example #9
    def record_init_stats(self):
        """
        Record some basic job stats, including the number of sites,
        realizations (end branches), and total number of tasks for the job.

        This should be run between the `pre-execute` and `execute` phases, once
        the job has been fully initialized.
        """
        # Record num sites, num realizations, and num tasks.
        num_sites = len(self.computation_mesh)
        realizations = models.LtRealization.objects.filter(hazard_calculation=self.hc.id)
        num_rlzs = realizations.count()

        # Compute the number of tasks.
        block_size = int(config.get("hazard", "block_size"))
        num_tasks = 0
        for lt_rlz in realizations:
            # Each realization has the potential to choose a random source
            # model, and thus there may be a variable number of tasks for each
            # realization (depending on the number of the sources in the model
            # which was chosen for the realization).
            num_sources = models.SourceProgress.objects.filter(lt_realization=lt_rlz).count()
            num_tasks += math.ceil(float(num_sources) / block_size)

        [job_stats] = models.JobStats.objects.filter(oq_job=self.job.id)
        job_stats.num_sites = num_sites
        job_stats.num_tasks = num_tasks
        job_stats.num_realizations = num_rlzs
        job_stats.save()
Example #10
    def initialize_sources(self):
        """
        Parse source models and validate source logic trees. It also
        filters out the sources far away from the sites and applies
        uncertainties to the relevant ones. As a side effect it populates
        the instance dictionary `.source_blocks_per_ltpath`. Notice that
        sources are automatically split.

        :returns:
            the list of `LtSourceModel` instances created, one per source
            model in the logic tree
        """
        logs.LOG.progress("initializing sources")
        smlt_file = self.hc.inputs['source_model_logic_tree']
        self.smlt = logictree.SourceModelLogicTree(
            file(smlt_file).read(), self.hc.base_path, smlt_file)
        sm_paths = list(self.smlt.get_sm_paths())
        nblocks = ceil(config.get('hazard', 'concurrent_tasks'), len(sm_paths))

        # here we are doing a full enumeration of the source model logic tree;
        # this is not bad since for very large source models there are
        # typically very few realizations; moreover, the filtering will remove
        # most of the sources, so the memory occupation is typically low
        lt_models = []
        for i, (sm, path) in enumerate(sm_paths):
            smpath = tuple(path)
            fname = os.path.join(self.hc.base_path, sm)
            source_collector = source.parse_source_model_smart(
                fname,
                self.hc.sites_affected_by,
                self.smlt.make_apply_uncertainties(path),
                self.hc)
            if not source_collector.source_weights:
                raise RuntimeError(
                    'Could not find sources close to the sites in %s '
                    '(maximum_distance=%s km)' %
                    (fname, self.hc.maximum_distance))

            lt_model = models.LtSourceModel.objects.create(
                hazard_calculation=self.hc, ordinal=i, sm_lt_path=smpath)
            lt_models.append(lt_model)
            for trt, blocks in source_collector.split_blocks(nblocks):
                self.source_blocks_per_ltpath[smpath, trt] = blocks
                n = sum(len(block) for block in blocks)
                logs.LOG.info('Found %d relevant source(s) for %s %s, TRT=%s',
                              n, sm, path, trt)
                logs.LOG.info('Splitting in %d blocks', len(blocks))
                for i, block in enumerate(blocks, 1):
                    logs.LOG.debug('%s, block %d: %d source(s), weight %s',
                                   trt, i, len(block), block.weight)

            # save LtModelInfo objects for each tectonic region type
            for trt in source_collector.sorted_trts():
                models.LtModelInfo.objects.create(
                    lt_model=lt_model,
                    tectonic_region_type=trt,
                    num_sources=len(source_collector.source_weights[trt]),
                    num_ruptures=source_collector.num_ruptures[trt],
                    min_mag=source_collector.min_mag[trt],
                    max_mag=source_collector.max_mag[trt])
        return lt_models
Example #11
 def test_get_with_empty_section_data(self):
     # config.get() returns `None` if the section data dict is empty
     with patch('openquake.engine.utils.config.get_section') as mock:
         mock.return_value = dict()
         self.assertTrue(config.get("whatever", "key") is None)
         self.assertEqual(1, mock.call_count)
         self.assertEqual([("whatever",), {}], mock.call_args)
Example #12
 def initialize_sources(self):
     """
     Parse source models, apply uncertainties and validate source logic
     trees. Save in the database LtSourceModel and TrtModel objects.
     """
     logs.LOG.progress("initializing sources")
     parallel_source_splitting = valid.boolean(
         config.get('hazard', 'parallel_source_splitting') or 'false')
     self.composite_model = readinput.get_composite_source_model(
         self.oqparam, self.site_collection,
         no_distribute=not parallel_source_splitting)
     for sm in self.composite_model:
         # create an LtSourceModel for each distinct source model
         lt_model = models.LtSourceModel.objects.create(
             hazard_calculation=self.job,
             sm_lt_path=self.tilepath + sm.path,
             ordinal=sm.ordinal, sm_name=sm.name, weight=sm.weight,
             samples=sm.samples)
         self._source_models.append(lt_model)
         gsims_by_trt = sm.gsim_lt.values
         # save TrtModels for each tectonic region type
         # and store the db IDs in the in-memory models
         for trt_mod in sm.trt_models:
             trt_mod.id = models.TrtModel.objects.create(
                 lt_model=lt_model,
                 tectonic_region_type=trt_mod.trt,
                 num_sources=len(trt_mod),
                 num_ruptures=trt_mod.num_ruptures,
                 min_mag=trt_mod.min_mag,
                 max_mag=trt_mod.max_mag,
                 gsims=gsims_by_trt[trt_mod.trt]).id
     # rebuild the info object with the trt_ids coming from the db
     self.composite_model.info = source.CompositionInfo(
         self.composite_model.source_model_lt,
         self.composite_model.source_models)
Example #13
 def test_get_with_empty_section_data(self):
     """config.get() returns `None` if the section data dict is empty."""
     with patch('openquake.engine.utils.config.get_section') as mock:
         mock.return_value = dict()
         self.assertTrue(config.get("whatever", "key") is None)
         self.assertEqual(1, mock.call_count)
         self.assertEqual([("whatever", ), {}], mock.call_args)
Example #14
 def test_get_with_unknown_key(self):
     """config.get() returns `None` if the `key` is not known."""
     with patch('openquake.engine.utils.config.get_section') as mock:
         mock.return_value = dict(b=1)
         self.assertTrue(config.get("arghh", "c") is None)
         self.assertEqual(1, mock.call_count)
         self.assertEqual([("arghh",), {}], mock.call_args)
Example #15
 def test_get_with_nonempty_section_data_and_known_key(self):
     # config.get() correctly returns the configuration datum for known
     # sections/keys
     with patch('openquake.engine.utils.config.get_section') as mock:
         mock.return_value = dict(a=11)
         self.assertEqual(11, config.get("hmmm", "a"))
         self.assertEqual(1, mock.call_count)
         self.assertEqual([("hmmm", ), {}], mock.call_args)
Example #16
 def test_get_with_nonempty_section_data_and_known_key(self):
     # config.get() correctly returns the configuration datum for known
     # sections/keys
     with patch('openquake.engine.utils.config.get_section') as mock:
         mock.return_value = dict(a=11)
         self.assertEqual(11, config.get("hmmm", "a"))
         self.assertEqual(1, mock.call_count)
         self.assertEqual([("hmmm",), {}], mock.call_args)
Example #17
File: core.py Project: 4x/oq-engine
    def pre_execute(self):
        """
        Do pre-execution work. At the moment, this work entails: parsing and
        initializing sources, parsing and initializing the site model (if there
        is one), and generating logic tree realizations. (The latter piece
        basically defines the work to be done in the `execute` phase.)
        """
        # Parse logic trees and create source Inputs.
        self.initialize_sources()

        # Deal with the site model and compute site data for the calculation
        # (if a site model was specified, that is).
        self.initialize_site_model()

        # Once the site model is init'd, create and cache the site collection;
        self.hc.init_site_collection()

        # Now bootstrap the logic tree realizations and related data.
        # This defines for us the "work" that needs to be done when we reach
        # the `execute` phase.
        # This will also stub out hazard curve result records. Workers will
        # update these periodically with partial results (partial meaning,
        # result curves for just a subset of the overall sources) when some
        # work is complete.
        self.initialize_realizations(
            rlz_callbacks=[self.initialize_hazard_curve_progress])

        self.record_init_stats()

        # Set the progress counters:
        num_sources = models.SourceProgress.objects.filter(
            is_complete=False,
            lt_realization__hazard_calculation=self.hc).count()
        self.progress['total'] += num_sources
        self.progress['hc_total'] = num_sources

        realizations = models.LtRealization.objects.filter(
            hazard_calculation=self.hc, is_complete=False)
        num_rlzs = realizations.count()
        num_points = len(self.hc.points_to_compute())
        self.progress['total'] += num_rlzs * num_points

        # Update stats to consider the disagg tasks as well:
        [job_stats] = models.JobStats.objects.filter(oq_job=self.job.id)
        block_size = int(config.get('hazard', 'block_size'))
        job_stats.num_tasks += int(
            math.ceil(float(num_points) * num_rlzs / block_size)
        )
        job_stats.save()

        # Update the progress info on the realizations, to include the disagg
        # phase:
        for rlz in realizations:
            rlz.total_items += num_points
            rlz.save()

        self.initialize_pr_data()
Example #18
def oqtask(task_func):
    """
    Task function decorator which sets up logging and catches (and logs) any
    errors which occur inside the task. Also checks to make sure the job is
    actually still running. If it is not running, the task doesn't get
    executed, so we don't do useless computation.
    """

    @wraps(task_func)
    def wrapped(*args):
        """
        Initialize logs, make sure the job is still running, and run the task
        code surrounded by a try-except. If any error occurs, log it as a
        critical failure.
        """
        # job_id is always assumed to be the first argument
        job_id = args[0]
        job = models.OqJob.objects.get(id=job_id)
        if job.is_running is False:
            # the job was killed, it is useless to run the task
            return

        # it is important to save the task id soon, so that
        # the revoke functionality can work
        EnginePerformanceMonitor.store_task_id(job_id, tsk)

        with EnginePerformanceMonitor(
                'total ' + task_func.__name__, job_id, tsk, flush=True):

            with EnginePerformanceMonitor(
                    'loading calculation object', job_id, tsk, flush=True):
                calculation = job.calculation

            # tasks write on the celery log file
            logs.init_logs(
                level=job.log_level,
                calc_domain='hazard' if isinstance(
                    calculation, models.HazardCalculation) else 'risk',
                calc_id=calculation.id)
            try:
                return task_func(*args), None
            except:
                etype, exc, tb = sys.exc_info()
                tb_str = ''.join(traceback.format_tb(tb))
                return '%s\n%s' % (exc, tb_str), etype
            finally:
                CacheInserter.flushall()
                # the task finished, we can remove from the performance
                # table the associated row 'storing task id'
                models.Performance.objects.filter(
                    oq_job=job,
                    operation='storing task id',
                    task_id=tsk.request.id).delete()
    celery_queue = config.get('amqp', 'celery_queue')
    tsk = task(wrapped, queue=celery_queue)
    tsk.task_func = task_func
    return tsk
Example #19
def oqtask(task_func):
    """
    Task function decorator which sets up logging and catches (and logs) any
    errors which occur inside the task. Also checks to make sure the job is
    actually still running. If it is not running, the task doesn't get
    executed, so we don't do useless computation.

    :param task_func: the function to decorate
    """

    def wrapped(*args):
        """
        Initialize logs, make sure the job is still running, and run the task
        code surrounded by a try-except. If any error occurs, log it as a
        critical failure.
        """
        # the last argument is assumed to be a monitor
        monitor = args[-1]
        job = models.OqJob.objects.get(id=monitor.job_id)
        if job.is_running is False:
            # the job was killed, it is useless to run the task
            raise JobNotRunning(monitor.job_id)

        # it is important to save the task id soon, so that
        # the revoke functionality can work
        with monitor("storing task id", task=tsk, autoflush=True):
            pass

        with logs.handle(job):
            check_mem_usage()  # warn if too much memory is used
            # run the task
            try:
                total = "total " + task_func.__name__
                with monitor(total, task=tsk):
                    with GroundShakingIntensityModel.forbid_instantiation():
                        return task_func(*args)
            finally:
                # save on the db
                CacheInserter.flushall()
                # the task finished, we can remove from the performance
                # table the associated row 'storing task id'
                models.Performance.objects.filter(
                    oq_job=job, operation="storing task id", task_id=tsk.request.id
                ).delete()

    celery_queue = config.get("amqp", "celery_queue")
    f = lambda *args: safely_call(wrapped, args, pickle=True)
    f.__name__ = task_func.__name__
    f.__module__ = task_func.__module__
    tsk = task(f, queue=celery_queue)
    tsk.__func__ = tsk
    tsk.task_func = task_func
    return tsk
Example #20
def oqtask(task_func):
    """
    Task function decorator which sets up logging and catches (and logs) any
    errors which occur inside the task. Also checks to make sure the job is
    actually still running. If it is not running, the task doesn't get
    executed, so we don't do useless computation.

    :param task_func: the function to decorate
    """
    def wrapped(*args):
        """
        Initialize logs, make sure the job is still running, and run the task
        code surrounded by a try-except. If any error occurs, log it as a
        critical failure.
        """
        # the last argument is assumed to be a monitor
        monitor = args[-1]
        job = models.OqJob.objects.get(id=monitor.job_id)
        if job.is_running is False:
            # the job was killed, it is useless to run the task
            raise JobNotRunning(monitor.job_id)

        # it is important to save the task id soon, so that
        # the revoke functionality can work
        with monitor('storing task id', task=tsk, autoflush=True):
            pass

        with logs.handle(job):
            check_mem_usage()  # warn if too much memory is used
            # run the task
            try:
                total = 'total ' + task_func.__name__
                with monitor(total, task=tsk, autoflush=True):
                    return task_func(*args)
            finally:
                # save on the db
                CacheInserter.flushall()
                # the task finished, we can remove from the performance
                # table the associated row 'storing task id'
                models.Performance.objects.filter(
                    oq_job=job,
                    operation='storing task id',
                    task_id=tsk.request.id).delete()

    celery_queue = config.get('amqp', 'celery_queue')
    f = lambda *args: safely_call(wrapped, args, pickle=True)
    f.__name__ = task_func.__name__
    f.__module__ = task_func.__module__
    tsk = task(f, queue=celery_queue)
    tsk.__func__ = tsk
    tsk.task_func = task_func
    return tsk
Example #21
    def pre_execute(self):
        """
        In this phase, the general workflow is:
            1. Parse the exposure to get the taxonomies
            2. Parse the available risk models
            3. Validate exposure and risk models
        """
        with self.monitor('get exposure'):
            exposure = self.rc.exposure_model
            if exposure is None:
                ExposureDBWriter(self.job).serialize(
                    parsers.ExposureModelParser(self.rc.inputs['exposure']))
            self.taxonomies_asset_count = \
                self.rc.exposure_model.taxonomies_in(self.rc.region_constraint)

        with self.monitor('parse risk models'):
            self.risk_models = self.get_risk_models()

        # populate ImtTaxonomy
        imt_taxonomy_set = set()
        for rm in self.risk_models.itervalues():
            self.loss_types.update(rm.loss_types)
            for imt in rm.imts:
                imt_taxonomy_set.add((imt, rm.taxonomy))
                # insert the IMT in the db, if not already there
                models.Imt.save_new([from_string(imt)])
        for imt, taxonomy in imt_taxonomy_set:
            models.ImtTaxonomy.objects.create(
                job=self.job, imt=models.Imt.get(imt), taxonomy=taxonomy)

            # consider only the taxonomies in the risk models if
            # taxonomies_from_model has been set to True in the
            # job.ini
            if self.rc.taxonomies_from_model:
                self.taxonomies_asset_count = dict(
                    (t, count)
                    for t, count in self.taxonomies_asset_count.items()
                    if t in self.risk_models)

        for validator_class in self.validators:
            validator = validator_class(self)
            error = validator.get_error()
            if error:
                raise ValueError("""Problems in calculator configuration:
                                 %s""" % error)

        num_assets = sum(self.taxonomies_asset_count.itervalues())
        num_taxonomies = len(self.taxonomies_asset_count)
        logs.LOG.info('Considering %d assets of %d distinct taxonomies',
                      num_assets, num_taxonomies)
        self.eps_sampling = int(config.get('risk', 'epsilon_sampling'))
Example #22
def oqtask(task_func):
    """
    Task function decorator which sets up logging and catches (and logs) any
    errors which occur inside the task. Also checks to make sure the job is
    actually still running. If it is not running, the task doesn't get
    executed, so we don't do useless computation.

    :param task_func: the function to decorate
    """
    def wrapped(*args):
        """
        Initialize logs, make sure the job is still running, and run the task
        code surrounded by a try-except. If any error occurs, log it as a
        critical failure.
        """
        # job_id is always assumed to be the first argument
        job_id = args[0]
        job = models.OqJob.objects.get(id=job_id)
        if job.is_running is False:
            # the job was killed, it is useless to run the task
            raise JobNotRunning(job_id)

        # it is important to save the task id soon, so that
        # the revoke functionality can work
        EnginePerformanceMonitor.store_task_id(job_id, tsk)

        with EnginePerformanceMonitor(
                'total ' + task_func.__name__, job_id, tsk, flush=True):
            # tasks write on the celery log file
            logs.set_level(job.log_level)
            try:
                # log a warning if too much memory is used
                check_mem_usage(SOFT_MEM_LIMIT, HARD_MEM_LIMIT)
                # run the task
                return task_func(*args)
            finally:
                # save on the db
                CacheInserter.flushall()
                # the task finished, we can remove from the performance
                # table the associated row 'storing task id'
                models.Performance.objects.filter(
                    oq_job=job,
                    operation='storing task id',
                    task_id=tsk.request.id).delete()
    celery_queue = config.get('amqp', 'celery_queue')
    f = lambda *args: safely_call(wrapped, args, pickle=True)
    f.__name__ = task_func.__name__
    tsk = task(f, queue=celery_queue)
    tsk.task_func = task_func
    return tsk
Example #23
 def check_nodes(self):
     """
     Check that the expected celery nodes are all up. The loop
     continues as long as the main thread keeps running.
     """
     while self.job_is_running(sleep=self.interval):
         live_nodes = self.ping(timeout=self.interval)
         if live_nodes < self.live_nodes:
             dead_nodes = list(self.live_nodes - live_nodes)
             logs.LOG.critical(
                 'Cluster nodes not accessible: %s', dead_nodes)
             terminate = boolean(
                 config.get('celery', 'terminate_job_when_celery_is_down'))
             if terminate:
                 os.kill(os.getpid(), signal.SIGABRT)  # commit suicide
Example #24
    def task_arg_gen(self):
        """
        Generator function for creating the arguments for each task.

        It is responsible for the distribution strategy. It divides
        the considered exposure into chunks of homogeneous assets
        (i.e. having the same taxonomy). The chunk size is given by
        the `block_size` openquake config parameter.

        :returns:
            An iterator over a list of arguments. Each contains:

            1. the job id
            2. a getter object needed to get the hazard data
            3. the needed risklib calculators
            4. the output containers to be populated
            5. the specific calculator parameter set
        """
        block_size = int(config.get('risk', 'block_size'))

        output_containers = writers.combine_builders(
            [builder(self) for builder in self.output_builders])

        num_tasks = 0
        for taxonomy, assets_nr in self.taxonomies_asset_count.items():
            asset_offsets = range(0, assets_nr, block_size)

            for offset in asset_offsets:
                with logs.tracing("getting assets"):
                    assets = models.ExposureData.objects.get_asset_chunk(
                        self.rc, taxonomy, offset, block_size)

                calculation_units = [
                    self.calculation_unit(loss_type, assets)
                    for loss_type in models.loss_types(self.risk_models)]

                num_tasks += 1
                yield [self.job.id,
                       calculation_units,
                       output_containers,
                       self.calculator_parameters]

        # sanity check to protect against future changes of the distribution
        # logic
        expected_tasks = self.expected_tasks(block_size)
        if num_tasks != expected_tasks:
            raise RuntimeError('Expected %d tasks, generated %d!' % (
                               expected_tasks, num_tasks))
Example #25
def get_client(**kwargs):
    """
    Return a redis kvs client connection for general OpenQuake engine
    calculation usage.

    PLEASE NOTE: The 'db' argument is automatically read from the openquake.cfg
    and set. If specified in ``kwargs``, it will be overridden with the setting
    in openquake.cfg.
    """
    global __KVS_CONN_POOL
    if __KVS_CONN_POOL is None:
        cfg = config.get_section("kvs")
        # get the default db from the openquake.cfg:
        db = int(config.get('kvs', 'redis_db'))
        __KVS_CONN_POOL = redis.ConnectionPool(
            max_connections=1, host=cfg["host"], port=int(cfg["port"]), db=db)
    kwargs.update({"connection_pool": __KVS_CONN_POOL})
    return redis.Redis(**kwargs)
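A possible usage of get_client() above (hypothetical sketch, standard redis-py calls): the module-level connection pool is created lazily on the first call and reused by every later call.

client = get_client()                  # first call builds __KVS_CONN_POOL
client.set("some-key", "some-value")   # ordinary redis-py operations
print client.get("some-key")           # later calls reuse the same pool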
Example #26
    def initialize_sources(self):
        """
        Parse source models and validate source logic trees. It also
        filters out the sources far away from the sites and applies
        uncertainties to the relevant ones. As a side effect it populates
        the instance dictionary
        `.source_blocks_per_ltpath`. Notice that sources are automatically
        split.

        :returns:
            a list with the number of sources for each source model
        """
        logs.LOG.progress("initializing sources")
        smlt_file = self.hc.inputs['source_model_logic_tree']
        self.smlt = logictree.SourceModelLogicTree(
            file(smlt_file).read(), self.hc.base_path, smlt_file)
        sm_paths = list(self.smlt.get_sm_paths())

        nblocks = ceil(config.get('hazard', 'concurrent_tasks'), len(sm_paths))
        bs = SequenceSplitter(nblocks)

        # here we are doing a full enumeration of the source model logic tree;
        # this is not bad because for very large source models there are
        # typically very few realizations; moreover, the filtering will remove
        # most of the sources, so the memory occupation is typically low
        num_sources = []  # the number of sources per sm_lt_path
        for sm, path in sm_paths:
            smpath = tuple(path)
            source_weight_pairs = source.parse_source_model_smart(
                os.path.join(self.hc.base_path, sm),
                self.hc.sites_affected_by,
                self.smlt.make_apply_uncertainties(path),
                self.hc)
            blocks = bs.split_on_max_weight(list(source_weight_pairs))
            self.source_blocks_per_ltpath[smpath] = blocks
            n = sum(len(block) for block in blocks)
            logs.LOG.info('Found %d relevant source(s) for %s %s', n, sm, path)
            logs.LOG.info('Splitting in blocks with at maximum %d ruptures',
                          bs.max_weight)
            for i, block in enumerate(blocks, 1):
                logs.LOG.info('Block %d: %d sources, %d ruptures',
                              i, len(block), block.weight)
            num_sources.append(n)
        return num_sources
Example #27
def do_hazard_map_post_process(job):
    """
    Create and distribute tasks for processing hazard curves into hazard maps.

    :param job:
        A :class:`openquake.engine.db.models.OqJob` which has some hazard
        curves associated with it.
    """
    logs.LOG.debug('> Post-processing - Hazard Maps')
    block_size = int(config.get('hazard', 'concurrent_tasks'))

    poes = job.hazard_calculation.poes_hazard_maps

    # Stats for debug logging:
    hazard_curve_ids = models.HazardCurve.objects.filter(
        output__oq_job=job).values_list('id', flat=True)
    logs.LOG.debug('num haz curves: %s' % len(hazard_curve_ids))

    # Limit the number of concurrent tasks to the configured concurrency level:
    block_gen = block_splitter(hazard_curve_ids, block_size)
    total_blocks = int(math.ceil(len(hazard_curve_ids) / float(block_size)))

    for i, block in enumerate(block_gen):
        logs.LOG.debug('> Hazard post-processing block, %s of %s'
                       % (i + 1, total_blocks))

        if openquake.engine.no_distribute():
            # just execute the post-processing using the plain function form of
            # the task
            for hazard_curve_id in block:
                hazard_curves_to_hazard_map_task(job.id, hazard_curve_id, poes)
        else:
            tasks = []
            for hazard_curve_id in block:
                tasks.append(hazard_curves_to_hazard_map_task.subtask(
                    (job.id, hazard_curve_id, poes)))
            results = TaskSet(tasks=tasks).apply_async()

            utils_tasks._check_exception(results)

        logs.LOG.debug('< Done Hazard Map post-processing block, %s of %s'
                       % (i + 1, total_blocks))
    logs.LOG.debug('< Done post-processing - Hazard Maps')
Example #28
def get_client(**kwargs):
    """
    Return a redis kvs client connection for general OpenQuake engine
    calculation usage.

    PLEASE NOTE: The 'db' argument is automatically read from the openquake.cfg
    and set. If specified in ``kwargs``, it will be overridden with the setting
    in openquake.cfg.
    """
    global __KVS_CONN_POOL
    if __KVS_CONN_POOL is None:
        cfg = config.get_section("kvs")
        # get the default db from the openquake.cfg:
        db = int(config.get('kvs', 'redis_db'))
        __KVS_CONN_POOL = redis.ConnectionPool(max_connections=1,
                                               host=cfg["host"],
                                               port=int(cfg["port"]),
                                               db=db)
    kwargs.update({"connection_pool": __KVS_CONN_POOL})
    return redis.Redis(**kwargs)
Example #29
def job_from_files(cfg_files,
                   username,
                   log_level='info',
                   exports='',
                   **extras):
    """
    Create a full job profile from a job config file.

    :param cfg_files:
        Paths to the job.ini files.
    :param str username:
        The user who will own this job profile and all results.
    :param str log_level:
        Desired log level.
    :param exports:
        Comma-separated string of desired export types.
    :param extras:
        Extra parameters (used only in the tests to override the params)

    :returns:
        :class:`openquake.engine.db.models.OqJob` object
    :raises:
        `RuntimeError` if the input job configuration is not valid
    """
    from openquake.commonlib.calculators import base
    # create the current job
    job = create_job(user_name=username, log_level=log_level)
    models.JobStats.objects.create(oq_job=job)
    with logs.handle(job, log_level):
        # read calculation params and create the calculation profile
        params = readinput.get_params(cfg_files)
        params['hazard_output_id'] = None
        params['hazard_calculation_id'] = None
        params.update(extras)
        # build and validate an OqParam object
        oqparam = readinput.get_oqparam(params, calculators=base.calculators)
        oqparam.concurrent_tasks = int(config.get('celery',
                                                  'concurrent_tasks'))
        job.save_params(vars(oqparam))
        job.save()
    return job
Example #30
    def pre_execute(self):
        """
        In this phase, the general workflow is:
            1. Parse the exposure to get the taxonomies
            2. Parse the available risk models
            3. Validate exposure and risk models
        """
        with self.monitor('get exposure'):
            self.taxonomies_asset_count = (
                self.rc.preloaded_exposure_model or
                ExposureDBWriter(self.job).serialize(
                    parsers.ExposureModelParser(self.rc.inputs['exposure']))
                ).taxonomies_in(self.rc.region_constraint)

        with self.monitor('parse risk models'):
            self.risk_models = self.get_risk_models()
            for rm in self.risk_models.itervalues():
                self.loss_types.update(rm.loss_types)

            # consider only the taxonomies in the risk models if
            # taxonomies_from_model has been set to True in the
            # job.ini
            if self.rc.taxonomies_from_model:
                self.taxonomies_asset_count = dict(
                    (t, count)
                    for t, count in self.taxonomies_asset_count.items()
                    if t in self.risk_models)

        for validator_class in self.validators:
            validator = validator_class(self)
            error = validator.get_error()
            if error:
                raise ValueError("""Problems in calculator configuration:
                                 %s""" % error)

        num_assets = sum(self.taxonomies_asset_count.itervalues())
        num_taxonomies = len(self.taxonomies_asset_count)
        logs.LOG.info('Considering %d assets of %d distinct taxonomies',
                      num_assets, num_taxonomies)
        self.eps_sampling = int(config.get('risk', 'epsilon_sampling'))
Example #31
def do_post_process(job):
    """
    Run the GMF to hazard curve post-processing tasks for the given ``job``.

    :param job:
        A :class:`openquake.engine.db.models.OqJob` instance.
    """
    logs.LOG.debug('> Post-processing - GMFs to Hazard Curves')
    block_size = int(config.get('hazard', 'concurrent_tasks'))
    block_gen = block_splitter(gmf_post_process_arg_gen(job), block_size)

    hc = job.hazard_calculation

    # Stats for debug logging:
    n_imts = len(hc.intensity_measure_types_and_levels)
    n_sites = len(hc.points_to_compute())
    n_rlzs = models.LtRealization.objects.filter(hazard_calculation=hc).count()
    total_blocks = int(math.ceil(
        (n_imts * n_sites * n_rlzs) / float(block_size)))

    for i, block in enumerate(block_gen):
        logs.LOG.debug('> GMF post-processing block, %s of %s'
                       % (i + 1, total_blocks))

        # Run the tasks in blocks, to avoid overqueueing:
        tasks = []
        for the_args in block:
            tasks.append(gmf_to_hazard_curve_task.subtask(the_args))
        results = TaskSet(tasks=tasks).apply_async()

        # Check for Exceptions in the results and raise
        utils_tasks._check_exception(results)

        logs.LOG.debug('< Done GMF post-processing block, %s of %s'
                       % (i + 1, total_blocks))
    logs.LOG.debug('< Done post-processing - GMFs to Hazard Curves')
Example #32
class SupervisorLogMessageConsumer(logs.AMQPLogSource):
    """
    Supervise an OpenQuake job by:

       - handling its "critical" and "error" messages
       - periodically checking that the job process is still running
    """
    # Failure counter check delay, translates to 60 seconds with the current
    # settings.
    FCC_DELAY = 60
    terminate = general.str2bool(
        config.get('celery', 'terminate_workers_on_revoke'))

    def __init__(self, job_id, job_pid, timeout=1):
        self.job_id = job_id
        job = OqJob.objects.get(id=job_id)
        self.calc_id = job.calculation.id
        if job.hazard_calculation is not None:
            self.calc_domain = 'hazard'
        else:
            self.calc_domain = 'risk'

        self.selflogger = logging.getLogger('oq.%s.%s.supervisor' %
                                            (self.calc_domain, self.calc_id))
        self.selflogger.debug('Entering supervisor for %s calc %s' %
                              (self.calc_domain, self.calc_id))
        logger_name = 'oq.%s.%s' % (self.calc_domain, self.calc_id)
        key = '%s.#' % logger_name
        super(SupervisorLogMessageConsumer, self).__init__(timeout=timeout,
                                                           routing_key=key)
        self.job_pid = job_pid
        self.joblogger = logging.getLogger(logger_name)
        self.jobhandler = logging.Handler(logging.ERROR)
        self.jobhandler.emit = self.log_callback
        self.joblogger.addHandler(self.jobhandler)
        # Failure counter check delay value
        self.fcc_delay_value = 0

    def run(self):
        """
        Wrap superclass' method just to add cleanup.
        """
        started = datetime.utcnow()
        super(SupervisorLogMessageConsumer, self).run()
        stopped = datetime.utcnow()
        self.selflogger.info(
            '%s calc %s finished in %s' %
            (self.calc_domain, self.calc_id, stopped - started))
        self.joblogger.removeHandler(self.jobhandler)
        self.selflogger.debug('Exiting supervisor for %s calc %s' %
                              (self.calc_domain, self.calc_id))

    def log_callback(self, record):
        """
        Handles messages of severe level from the supervised job.
        """
        if record.name == self.selflogger.name:
            # ignore error log messages sent by selflogger.
            # this way we don't try to kill the job if its
            # process has crashed (or has been stopped).
            # we emit selflogger's error messages from
            # timeout_callback().
            return

        terminate_job(self.job_pid)

        update_job_status(self.job_id)

        record_job_stop_time(self.job_id)

        cleanup_after_job(self.job_id, self.terminate)

        self.stop()

    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running
        and whether it experienced any failures.

        Terminate the job process in the latter case.
        """
        def failure_counters_need_check():
            """Return `True` if failure counters should be checked."""
            self.fcc_delay_value += 1
            result = self.fcc_delay_value >= self.FCC_DELAY
            if result:
                self.fcc_delay_value = 0
            return result

        process_stopped = job_failed = False
        message = None

        if not supervising.is_pid_running(self.job_pid):
            message = ('job process %s crashed or terminated' % self.job_pid)
            process_stopped = True
        elif failure_counters_need_check():
            # Job process is still running.
            failures = stats.failure_counters(self.job_id)
            failed_nodes = None
            if failures:
                message = "job terminated with failures: %s" % failures
            else:
                # Don't check for failed nodes if distribution is disabled.
                # In this case, we don't expect any nodes to be present, and
                # thus, there are none that can fail.
                if not openquake.engine.no_distribute():
                    failed_nodes = abort_due_to_failed_nodes(self.job_id)
                    if failed_nodes:
                        message = ("job terminated due to %s failed nodes" %
                                   failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if job_failed or process_stopped:
            job_status = get_job_status(self.job_id)
            if process_stopped and job_status == 'complete':
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.debug(message)
            elif not job_status == 'complete':
                # The job crashed without having a chance to update the
                # status in the database, or it has been running even though
                # there were failures. We update the job status here.
                self.selflogger.error(message)
                update_job_status(self.job_id)

            record_job_stop_time(self.job_id)
            cleanup_after_job(self.job_id, self.terminate)
            raise StopIteration()
Example #33
 def concurrent_tasks(self):
     """
     For hazard calculators, the number of tasks to be in queue
     at any given time is specified in the configuration file.
     """
     return int(config.get('hazard', 'concurrent_tasks'))
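Accessors like this one (and the block_size variants in Examples #35-#37) are presumably properties on a calculator class; a minimal sketch of how such an accessor could be wired up, using a hypothetical class name, looks like this:

from openquake.engine.utils import config


class HazardCalculatorStub(object):  # hypothetical host class for the accessor
    @property
    def concurrent_tasks(self):
        """
        For hazard calculators, the number of tasks to be in queue
        at any given time is specified in the configuration file.
        """
        return int(config.get('hazard', 'concurrent_tasks'))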
Example #34
 def concurrent_tasks(self):
     """
     For hazard calculators, the number of tasks to be in queue
     at any given time is specified in the configuration file.
     """
     return int(config.get('hazard', 'concurrent_tasks'))
Example #35
 def point_source_block_size(self):
     """
     Similar to :meth:`block_size`, except that this parameter applies
     specifically to grouping of point sources.
     """
     return int(config.get('hazard', 'point_source_block_size'))
Example #36
 def block_size(self):
     """
     For hazard calculators, the number of work items per task
     is specified in the configuration file.
     """
     return int(config.get("hazard", "block_size"))
Example #37
 def block_size(self):
     """
     Number of assets handled per task.
     """
     return int(config.get('risk', 'block_size'))
Example #38
 def open(cls):
     """Initialize the test store."""
     if TestStore._conn is not None:
         return
     TestStore._conn = redis.Redis(db=int(config.get("kvs", "test_db")))
Example #39
            # Set up logging via amqp.
            if isinstance(calculation, models.HazardCalculation):
                logs.init_logs_amqp_send(level=job.log_level,
                                         calc_domain='hazard',
                                         calc_id=calculation.id)
            else:
                logs.init_logs_amqp_send(level=job.log_level,
                                         calc_domain='risk',
                                         calc_id=calculation.id)
            try:
                res = task_func(*args, **kwargs)
            except Exception, err:
                logs.LOG.critical('Error occurred in task: %s', err)
                logs.LOG.exception(err)
                raise
            else:
                return res
            finally:
                CacheInserter.flushall()
                # the task finished, we can remove from the performance
                # table the associated row 'storing task id', then the
                # supervisor will not try revoke it without need
                models.Performance.objects.filter(
                    oq_job=job,
                    operation='storing task id',
                    task_id=tsk.request.id).delete()

    celery_queue = config.get('amqp', 'celery_queue')
    tsk = task(wrapped, ignore_result=True, queue=celery_queue)
    return tsk
Example #40
# You should have received a copy of the GNU Affero General Public License
# along with OpenQuake.  If not, see <http://www.gnu.org/licenses/>.
"""Utility functions related to splitting work into tasks."""

from celery.result import ResultSet
from celery.app import current_app
from celery.task import task

from openquake.commonlib.parallel import \
    TaskManager, safely_call, check_mem_usage
from openquake.engine import logs
from openquake.engine.db import models
from openquake.engine.utils import config
from openquake.engine.writer import CacheInserter

CONCURRENT_TASKS = int(config.get('celery', 'concurrent_tasks'))

SOFT_MEM_LIMIT = int(config.get('memory', 'soft_mem_limit'))
HARD_MEM_LIMIT = int(config.get('memory', 'hard_mem_limit'))
check_mem_usage.__defaults__ = (SOFT_MEM_LIMIT, HARD_MEM_LIMIT)


class JobNotRunning(Exception):
    pass


class OqTaskManager(TaskManager):
    """
    A celery-based task manager. The usage is::

      oqm = OqTaskManager(do_something, logs.LOG.progress)
Example #41
 def concurrent_tasks(self):
     """
     Number of tasks to be in queue at any given time.
     """
     return int(config.get('risk', 'concurrent_tasks'))
Example #42
                logs.init_logs_amqp_send(level=job.log_level,
                                         calc_domain='hazard',
                                         calc_id=calculation.id)
            else:
                logs.init_logs_amqp_send(level=job.log_level,
                                         calc_domain='risk',
                                         calc_id=calculation.id)

            try:
                # Tasks can be used in the `execute` or `post-process` phase
                if job.is_running is False:
                    raise JobCompletedError('Job %d was killed' % job_id)
                elif job.status not in ('executing', 'post_processing'):
                    raise JobCompletedError(
                        'The status of job %d is %s, should be executing or '
                        'post_processing' % (job_id, job.status))
                # else continue with task execution
                res = task_func(*args, **kwargs)
            # TODO: should we do something different with JobCompletedError?
            except Exception, err:
                logs.LOG.critical('Error occurred in task: %s', err)
                logs.LOG.exception(err)
                raise
            else:
                return res
            finally:
                CacheInserter.flushall()
    celery_queue = config.get('amqp', 'celery_queue')
    tsk = task(wrapped, ignore_result=True, queue=celery_queue)
    return tsk
Example #43
"""Engine: A collection of fundamental functions for initializing and running
calculations."""

import sys
import traceback

from openquake.baselib.performance import Monitor
from openquake.commonlib import valid
from openquake.commonlib.oqvalidation import OqParam
from openquake.calculators import base
from openquake.engine import logs
from openquake.engine.utils import config, tasks

TERMINATE = valid.boolean(
    config.get('celery', 'terminate_workers_on_revoke') or 'false')

USE_CELERY = valid.boolean(config.get('celery', 'use_celery') or 'false')

if USE_CELERY:
    import celery.task.control

    def set_concurrent_tasks_default():
        """
        Set the default for concurrent_tasks to twice the number of workers.
        Returns the number of live celery nodes (i.e. the number of machines).
        """
        stats = celery.task.control.inspect(timeout=1).stats()
        if not stats:
            sys.exit("No live compute nodes, aborting calculation")
        num_cores = sum(stats[k]['pool']['max-concurrency'] for k in stats)
Example #44
from openquake.engine.performance import EnginePerformanceMonitor
from openquake.engine.writer import CacheInserter
from openquake.engine.settings import DATABASES
from openquake.engine.db.models import Performance
from openquake.engine.db.schema.upgrades import upgrader

from openquake import hazardlib, risklib, commonlib

from openquake.commonlib import readinput, valid

INPUT_TYPES = set(dict(models.INPUT_TYPE_CHOICES))

UNABLE_TO_DEL_HC_FMT = 'Unable to delete hazard calculation: %s'
UNABLE_TO_DEL_RC_FMT = 'Unable to delete risk calculation: %s'

TERMINATE = valid.boolean(config.get('celery', 'terminate_workers_on_revoke'))


class InvalidHazardCalculationID(Exception):
    pass


RISK_HAZARD_MAP = dict(scenario_risk=['scenario'],
                       scenario_damage=['scenario'],
                       classical_risk=['classical'],
                       classical_bcr=['classical'],
                       classical_damage=['classical'],
                       event_based_risk=['event_based'],
                       event_based_bcr=['event_based'])

Example #45
 def block_size(self):
     """
     For hazard calculators, the number of work items per task
     is specified in the configuration file.
     """
     return int(config.get('hazard', 'block_size'))
Example #46
 def block_size(self):
     """
     Number of assets handled per task.
     """
     return int(config.get('risk', 'block_size'))
Example #47
def test_task(func, *args, **kwargs):
    kwargs['queue'] = config.get('amqp', 'celery_queue')
    return task(func, *args, **kwargs)
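A hypothetical usage sketch for test_task above: it wraps a plain function as a celery task bound to the queue named in openquake.cfg.

def add(x, y):
    return x + y

add_task = test_task(add)      # a celery task on the configured queue
result = add_task.delay(1, 2)  # enqueue it; result.get() would block for 3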
Example #48
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with OpenQuake. If not, see <http://www.gnu.org/licenses/>.
"""Utility functions related to splitting work into tasks."""
import types

from openquake.baselib.performance import Monitor
from openquake.commonlib import parallel, valid
from openquake.engine import logs
from openquake.engine.utils import config

litetask = parallel.litetask
celery_queue = config.get('amqp', 'celery_queue')
SOFT_MEM_LIMIT = int(config.get('memory', 'soft_mem_limit'))
HARD_MEM_LIMIT = int(config.get('memory', 'hard_mem_limit'))
USE_CELERY = valid.boolean(config.get('celery', 'use_celery') or 'false')
parallel.check_mem_usage.__defaults__ = (Monitor(), SOFT_MEM_LIMIT,
                                         HARD_MEM_LIMIT)

if USE_CELERY:
    from celery.result import ResultSet
    from celery.app import current_app
    from celery.task import task

    class OqTaskManager(parallel.TaskManager):
        """
        A celery-based task manager. The usage is::
Example #49
from openquake import risklib
from openquake import nrmllib

from openquake.commonlib import readini, valid


INPUT_TYPES = set(dict(models.INPUT_TYPE_CHOICES))

UNABLE_TO_DEL_HC_FMT = "Unable to delete hazard calculation: %s"
UNABLE_TO_DEL_RC_FMT = "Unable to delete risk calculation: %s"

LOG_FORMAT = (
    "[%(asctime)s %(job_type)s job #%(job_id)s %(hostname)s " "%(levelname)s %(processName)s/%(process)s] %(message)s"
)

TERMINATE = valid.boolean(config.get("celery", "terminate_workers_on_revoke"))


def cleanup_after_job(job, terminate):
    """
    Release the resources used by an openquake job.
    In particular revoke the running tasks (if any).

    :param job: the OqJob instance
    :param bool terminate: the celery revoke command terminate flag
    """
    # Using the celery API, revoke (and optionally terminate) any running
    # tasks associated with the current job.
    task_ids = Performance.objects.filter(oq_job=job, operation="storing task id", task_id__isnull=False).values_list(
        "task_id", flat=True
    )
Example #50
 def concurrent_tasks(self):
     """
     Number of tasks to be in queue at any given time.
     """
     return int(config.get('risk', 'concurrent_tasks'))
Example #51
from openquake.engine.calculators import base
from openquake.engine.calculators.risk import \
    writers, validation, hazard_getters
from openquake.engine.utils import config, tasks
from openquake.engine.performance import EnginePerformanceMonitor
from openquake.engine.input.exposure import ExposureDBWriter

MEMORY_ERROR = '''Running the calculation will require approximately
%dM, i.e. more than the memory which is available right now (%dM).
Please increase the free memory or apply a stringent region
constraint to reduce the number of assets. Alternatively you can set
epsilon_sampling in openquake.cfg. If the correlation is
nonzero, consider setting asset_correlation=0 to avoid building the
correlation matrix.'''

eps_sampling = int(config.get('risk', 'epsilon_sampling'))


@tasks.oqtask
def prepare_risk(counts_taxonomy, calc, monitor):
    """
    Associates the assets to the closest hazard sites and populate
    the table asset_site. For some calculators also initializes the
    epsilon matrices and save them on the database.

    :param counts_taxonomy:
        a sorted list of pairs (counts, taxonomy) for each bunch of assets
    :param calc:
        the current risk calculator
    :param monitor:
        monitor of the current risk job