Example #1
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name,
                           master=self.disco.master,
                           worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker"
                     % (settings.get('worker'), e))
            self.job = Job(name=rule.name,
                           master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)
Example #2
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get("server", settings.get("server")))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get("worker").rpartition(".")
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get("worker"), e))
            self.job = Job(name=rule.name, master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)
Example #3
    def check_results(self, jobname, start_time, timeout, poll_interval):
        try:
            status, results = self.results(jobname, timeout=poll_interval)
        except CommError as e:
            status = 'active'
        if status == 'ready':
            return results
        if status != 'active':
            raise JobError(Job(name=jobname, master=self),
                           "Status {0}".format(status))
        if timeout and time.time() - start_time > timeout:
            raise JobError(Job(name=jobname, master=self), "Timeout")
        raise Continue()
Example #4
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest cluster for each datapoint in the input dataset.
    """

    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params["centers"] = [
        (i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])
    ]

    job.pipeline = [("split",
                     Stage("kmeans_predict",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=predict_map))]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")

    return job.wait(show=show)
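
The call above returns whatever job.wait(show=show) yields, i.e. the result urls of the prediction job. Below is a hedged sketch of consuming that return value; dataset and fitmodel_url are assumed to come from the surrounding library (the fit() example further down shows where fitmodel_url originates), and the key/value layout of each record is also an assumption.

    from disco.core import result_iterator

    predictions = predict(dataset, fitmodel_url, save_results=True, show=False)
    for key, value in result_iterator(predictions):   # (key, value) result records
        print(key, value)
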
Example #5
    def profile_stats(self, jobname, mode='', stream=sys.stdout):
        """
        Returns results of job profiling.
        :ref:`jobdict` must have had the ``profile`` flag enabled.

        :type  mode: 'map' or 'reduce' or ''
        :param mode: restricts results to the map or reduce phase;
                     the default ``''`` applies no restriction.

        :type  stream: file-like object
        :param stream: alternate output stream.
                       See the `pstats.Stats constructor <http://docs.python.org/library/profile.html#pstats.Stats>`_.

        The function returns a `pstats.Stats object <http://docs.python.org/library/profile.html#the-stats-class>`_.
        For instance, you can print out results as follows::

                job.profile_stats().sort_stats('cumulative').print_stats()

        .. versionadded:: 0.2.1
        """
        prefix = 'profile-%s' % mode
        f = [s for s in self.oob_list(jobname) if s.startswith(prefix)]
        if not f:
            raise JobError(Job(name=jobname, master=self), "No profile data")

        import pstats
        stats = pstats.Stats(Stats(self.oob_get(jobname, f[0])), stream=stream)
        for s in f[1:]:
            stats.add(Stats(self.oob_get(jobname, s)))
        stats.strip_dirs()
        stats.sort_stats('cumulative')
        return stats
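
The docstring above documents the mode and stream parameters, but its embedded example only shows the default call. Below is a hedged usage sketch that restricts profiling to the map phase and captures the report in a StringIO buffer; disco is assumed to be the Disco handle this method is bound to, and "myjob@..." is a placeholder name for a job run with the profile flag enabled. Note that profile_stats() already strips directory prefixes and sorts by cumulative time before returning.

    from io import StringIO

    buf = StringIO()
    stats = disco.profile_stats("myjob@...", mode='map', stream=buf)
    stats.print_stats(10)   # write the 10 most expensive entries to buf
    print(buf.getvalue())
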
Example #6
    def new_job(self, name, **jobargs):
        """
        Submits a new job request to the master using :class:`disco.job.Job`::

                return Job(name=name, master=self.master).run(**jobargs)
        """
        return Job(name=name, master=self.master).run(**jobargs)
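
Since new_job() simply forwards **jobargs to Job.run(), the keyword arguments are whatever Job.run() accepts. A minimal sketch using the classic map/reduce interface follows; disco is assumed to be a disco.core.Disco instance, and the worker signatures and the input tag are assumptions not shown on this page.

    from disco.util import kvgroup

    def fun_map(line, params):
        # classic map: one text line in, (word, 1) pairs out
        for word in line.split():
            yield word, 1

    def fun_reduce(iter, params):
        # classic reduce: sum the counts for each word
        for word, counts in kvgroup(sorted(iter)):
            yield word, sum(counts)

    job = disco.new_job("wordcount",
                        input=["tag://data:words"],   # hypothetical DDFS tag
                        map=fun_map,
                        reduce=fun_reduce)
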
Example #7
def get(program, key, jobname):
    """Usage: key jobname

    Print the oob value for the given key and jobname.
    """
    from disco.job import Job
    print(Job(name=program.job_history(jobname), master=program.disco).oob_get(key))
Example #8
def oob(program, jobname):
    """Usage: jobname

    Print the oob keys for the named job.
    """
    from disco.job import Job
    for key in Job(name=jobname, master=program.disco).oob_list():
        print(key)
Example #9
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest cluster for each datapoint in the input dataset.
    """

    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params["centers"] = [(i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])]

    job.pipeline = [
        (
            "split",
            Stage("kmeans_predict", input_chain=dataset.params["input_chain"], init=simple_init, process=predict_map),
        )
    ]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")

    return job.wait(show=show)
Example #10
    def wait(self,
             jobname,
             poll_interval=2,
             timeout=None,
             clean=False,
             show=None):
        """
        Block until the job has finished.
        Returns a list of the result urls.

        :type  poll_interval: int
        :param poll_interval: the number of seconds between job status requests.

        :type  timeout: int or None
        :param timeout: if specified, the number of seconds before returning or
                        raising a :class:`disco.JobError`.

        :type  clean: bool
        :param clean: if `True`,
                      call :meth:`Disco.clean` when the job has finished.

                      .. deprecated:: 0.4

        :type  show: bool or string
        :param show: enables console output of job events.
                     The default is provided by :envvar:`DISCO_EVENTS`.

                     .. versionadded:: 0.2.3
        """
        if show is None:
            show = self.settings['DISCO_EVENTS']
        event_monitor = EventMonitor(Job(name=jobname, master=self.master),
                                     format=show,
                                     poll_interval=poll_interval)
        start_time = time.time()
        try:
            while True:
                event_monitor.refresh()
                try:
                    return self.check_results(jobname, start_time, timeout,
                                              poll_interval * 1000)
                except Continue:
                    continue
                finally:
                    if clean:
                        self.clean(jobname)
                    event_monitor.refresh()
        finally:
            event_monitor.cleanup()
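
A minimal usage sketch of the wait() shown above; disco is assumed to be the Disco handle, "wordcount@..." is a placeholder for an already submitted job name, and JobError is assumed to live in disco.error as in the upstream disco distribution.

    from disco.error import JobError

    try:
        results = disco.wait("wordcount@...", poll_interval=5, timeout=600, show=True)
    except JobError as err:
        # raised when the status is neither 'ready' nor 'active', or on timeout
        print("job failed: %s" % err)
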
Example #11
class InfernoJob(object):
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get("server", settings.get("server")))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get("worker").rpartition(".")
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get("worker"), e))
            self.job = Job(name=rule.name, master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)

    @property
    def job_name(self):
        return self.job.name

    @property
    def rule_name(self):
        return self.rule.qualified_name

    def start(self):
        # process the map-results option (i.e. skip the map phase and grab map results from a job id/ddfs)
        self.archiver = self._determine_job_blobs()
        job_blobs = self.archiver.job_blobs
        # print "BLOOBS: %s" % job_blobs
        self.start_time = time.time()
        if self.settings.get("just_query"):
            self.query()
            return None
        if self._enough_blobs(len(job_blobs)):
            if self.rule.rule_init_function:
                self.rule.rule_init_function(self.params)
            self.job.run(
                name=self.rule.name,
                input=job_blobs,
                map=self.rule.map_function,
                reduce=self.rule.reduce_function,
                params=self.params,
                partitions=self.rule.partitions,
                map_input_stream=self.rule.map_input_stream,
                map_output_stream=self.rule.map_output_stream,
                map_init=self.rule.map_init_function,
                save=self.rule.save or self.rule.result_tag is not None,
                scheduler=self.rule.scheduler,
                combiner=self.rule.combiner_function,
                reduce_output_stream=self.rule.reduce_output_stream,
                sort=self.rule.sort,
                sort_buffer_size=self.rule.sort_buffer_size,
                profile=self.settings.get("profile"),
                partition=self.rule.partition_function,
                required_files=self.rule.required_files,
                required_modules=self.rule.required_modules,
            )
            # actual id is only assigned after starting the job
            self.full_job_id = self.job.name
            return self.job
        return None

    def query(self):
        log.info("Query information:")
        pprint.pprint(
            {
                "source query": self.archiver.tags,
                "tag results": self.archiver.tag_map,
                "total_blobs": self.archiver.blob_count,
            }
        )

    def _safe_str(self, value):
        try:
            return str(value)
        except UnicodeEncodeError:
            return unicode(value).encode("utf-8")

    def wait(self):
        blob_count = self.archiver.blob_count
        log.info("Started job %s processing %i blobs", self.job.name, blob_count)
        self._notify(JOB_WAIT)
        try:
            jobout = self.job.wait()
            log.info("Done waiting for job %s", self.job.name)
            self._profile(self.job)
            self._tag_results(self.job.name)
            if not self.settings.get("debug"):
                self._process_results(jobout, self.job.name)
            else:
                results = self._get_job_results(jobout)
                reduce_result(results)
            self._purge(self._safe_str(self.job.name))
        except Exception as e:
            log.error("Job %s failed", self.job.name)
            self._notify(JOB_ERROR)
            if self.rule.notify_on_fail:
                try:
                    from inferno.lib.notifications import send_mail

                    send_mail(
                        job_id=self.job.name,
                        job_fail=e,
                        mail_to=self.rule.notify_addresses,
                        mail_from=self.settings.get("mail_from"),
                        mail_server=self.settings.get("mail_server"),
                    )
                except Exception as e:
                    log.error("Job %s failed notification: %s", self.job.name, e, exc_info=sys.exc_info())
            raise
        else:
            if not self.settings.get("debug"):
                self._archive_tags(self.archiver)
            if self.rule.rule_cleanup:
                self._notify(JOB_CLEANUP)
                self.rule.rule_cleanup(self)
            self._notify(JOB_DONE)
            if self.rule.notify_on_success:
                try:
                    from inferno.lib.notifications import send_mail

                    msg = "Job %s finished successfully." % self.job.name
                    send_mail(
                        job_id=self.job.name,
                        job_fail=msg,
                        mail_to=self.rule.notify_addresses,
                        mail_from=self.settings.get("mail_from"),
                        mail_server=self.settings.get("mail_server"),
                    )
                except Exception as e:
                    log.error("Job %s failed notification: %s", self.job.name, e, exc_info=sys.exc_info())
        log.info("Finished job %s", self.job.name)

    def _determine_job_blobs(self):
        self._notify(JOB_BLOBS)
        tags = self.job_options.tags
        urls = self.job_options.urls + self.urls if self.urls else self.job_options.urls
        if tags or urls:
            log.info("Processing input: %s...", (tags + urls)[:1000])
        else:
            log.info("No input available for %s." % self.rule.name)
        archiver = Archiver(
            ddfs=self.ddfs,
            archive_prefix=self.rule.archive_tag_prefix,
            archive_mode=self.rule.archive,
            max_blobs=self.rule.max_blobs,
            tags=tags,
            urls=urls,
            newest_first=self.rule.newest_first,
        )
        return archiver

    def _get_job_results(self, jobout):
        if self.rule.result_iterator:
            self._notify(JOB_RESULTS)
            return self.rule.result_iterator(jobout)

    def _profile(self, job):
        if self.settings.get("profile"):
            self._notify(JOB_PROFILE)
            job.profile_stats().sort_stats("cumulative").print_stats()

    def _tag_results(self, job_name):
        if self.job_options.result_tag:
            self._notify(JOB_TAG)
            result_name = "disco:job:results:%s" % job_name
            suffix = job_name
            # try to guess a better suffix (ie. the date)
            # sort the tags the job ran on, take the last part of the last tag
            # if that looks like a date, use it, otherwise use the job name
            if self.rule.result_tag_suffix:
                if str(self.rule.result_tag_suffix).lower() == "date":
                    suffix = str(datetime.now().date())
                else:
                    tags = sorted(self.job_options.tags)
                    date = (tags[-1].split(":"))[-1]
                    if len(date) == 10 and "-" in date:
                        suffix = date
            tag_name = "%s:%s" % (self.job_options.result_tag, suffix)
            log.info("Tagging result: %s", tag_name)
            try:
                self.ddfs.tag(tag_name, list(self.ddfs.blobs(result_name)))
            except Exception as e:
                log.error("Error tagging result %s", tag_name)
                raise

    def _process_results(self, jobout, job_id):
        if self.rule.result_processor:
            self._notify(JOB_PROCESS)
            results = self._get_job_results(jobout)
            self.rule.result_processor(results, params=self.params, job_id=job_id)

    def _purge(self, job_name):
        if not self.settings.get("no_purge"):
            self._notify(JOB_PURGE)
            self.disco.purge(job_name)

    def _archive_tags(self, archiver):
        if archiver.archive_mode:
            self._notify(JOB_ARCHIVE)
            archiver.archive()

    def _notify(self, stage):
        # if we are daemon spawn, tell mommy where we are
        if self.full_job_id:
            log.info("Worker: %s stage= %s " % (self.full_job_id, stage))

    def _enough_blobs(self, blob_count):
        # Note that the blob_count argument is the total number of tag blobs and urls.
        # To take urls into account: if no tags are specified but urls are available,
        # let the job run.
        if len(self.job_options.tags) == 0:
            if blob_count:
                return True
            else:
                log.info("Skipping job %s: %d blobs required, has %d", self.rule.name, self.rule.min_blobs, blob_count)
                return False

        if not blob_count or (blob_count < self.rule.min_blobs and not self.settings.get("force")):
            log.info("Skipping job %s: %d blobs required, has %d", self.rule.name, self.rule.min_blobs, blob_count)
            return False
        return True

    def __str__(self):
        return "<InfernoJob for: %s>" % self.rule.name
Example #12
def fit(dataset,
        n_clusters=5,
        max_iterations=10,
        random_state=None,
        save_results=True,
        show=False):
    """
    Optimize k-means clustering for at most `max_iterations` iterations,
    starting from `n_clusters` randomly initialized cluster centers.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
        if n_clusters < 2:
            raise Exception("Parameter n_clusters should be greater than 1.")
        if max_iterations < 1:
            raise Exception(
                "Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("kmeans_init_map",
                           input_chain=dataset.params["input_chain"],
                           init=map_init,
                           process=random_init_map)),
                    ('group_label',
                     Stage("kmeans_init_reduce",
                           process=estimate_reduce,
                           init=simple_init,
                           combine=True))]
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params['seed'] = random_state
    job.params['k'] = n_clusters

    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    centers = [(i, c) for i, c in result_iterator(init)]

    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = dict(dataset.params.items() + mean_point_center.items())
        job.params['k'] = n_clusters
        job.params['centers'] = centers

        job.pipeline = [('split',
                         Stage("kmeans_map_iter_%s" % (j + 1, ),
                               input_chain=dataset.params["input_chain"],
                               process=estimate_map,
                               init=simple_init)),
                        ('group_label',
                         Stage("kmeans_reduce_iter_%s" % (j + 1, ),
                               process=estimate_reduce,
                               init=simple_init,
                               combine=True))]

        job.run(input=dataset.params["data_tag"],
                name='kmeans_iter_%d' % (j + 1, ))
        fitmodel_url = job.wait(show=show)
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

    return {"kmeans_fitmodel": fitmodel_url}  # return results url
Example #13
class InfernoJob(object):
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name,
                           master=self.disco.master,
                           worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker"
                     % (settings.get('worker'), e))
            self.job = Job(name=rule.name,
                           master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)

    @property
    def job_name(self):
        return self.job.name

    @property
    def rule_name(self):
        return self.rule.qualified_name

    def start(self):
        # process the map-results option (i.e. skip the map phase and grab map results from a job id/ddfs)
        self.archiver = self._determine_job_blobs()
        job_blobs = self.archiver.job_blobs
        #print "BLOOBS: %s" % job_blobs
        self.start_time = time.time()
        if self.settings.get('just_query'):
            self.query()
            return None
        if self._enough_blobs(len(job_blobs)):
            if self.rule.rule_init_function:
                self.rule.rule_init_function(self.params)
            self.job.run(name=self.rule.name,
                         input=job_blobs,
                         map=self.rule.map_function,
                         reduce=self.rule.reduce_function,
                         params=self.params,
                         partitions=self.rule.partitions,
                         map_input_stream=self.rule.map_input_stream,
                         map_output_stream=self.rule.map_output_stream,
                         map_init=self.rule.map_init_function,
                         save=self.rule.save or self.rule.result_tag is not None,
                         scheduler=self.rule.scheduler,
                         combiner=self.rule.combiner_function,
                         reduce_output_stream=self.rule.reduce_output_stream,
                         sort=self.rule.sort,
                         sort_buffer_size=self.rule.sort_buffer_size,
                         profile=self.settings.get('profile'),
                         partition=self.rule.partition_function,
                         required_files=self.rule.required_files,
                         required_modules=self.rule.required_modules)
            # actual id is only assigned after starting the job
            self.full_job_id = self.job.name
            return self.job
        return None

    def query(self):
        log.info("Query information:")
        pprint.pprint({'source query': self.archiver.tags,
                       'tag results': self.archiver.tag_map,
                       'total_blobs': self.archiver.blob_count})

    def _safe_str(self, value):
        try:
            return str(value)
        except UnicodeEncodeError:
            return unicode(value).encode('utf-8')

    def wait(self):
        blob_count = self.archiver.blob_count
        log.info('Started job %s processing %i blobs',
                 self.job.name, blob_count)
        self._notify(JOB_WAIT)
        try:
            jobout = self.job.wait()
            log.info('Done waiting for job %s', self.job.name)
            self._profile(self.job)
            self._tag_results(self.job.name)
            if not self.settings.get('debug'):
                try_to_execute(partial(self._process_results, jobout, self.job.name))
            else:
                results = self._get_job_results(jobout)
                reduce_result(results)
            self._purge(self._safe_str(self.job.name))
        except Exception as e:
            log.error('Job %s failed with %s', self.job.name, e.message)
            self._notify(JOB_ERROR)
            if self.rule.notify_on_fail:
                import traceback
                exc = traceback.format_exc(15)
                try:
                    from inferno.lib.notifications import send_mail
                    send_mail(job_id=self.job.name, job_fail=exc,
                              mail_to=self.rule.notify_addresses,
                              mail_from=self.settings.get('mail_from'),
                              mail_server=self.settings.get('mail_server'),
                              retry=self.rule.retry,
                              retry_delay=self.rule.retry_delay)
                except Exception as mail_ex:
                    log.error(
                        'Mail notification failed for %s: %s',
                        self.job.name, mail_ex, exc_info=sys.exc_info())

                if self.rule.notify_pagerduty:
                    if not self.rule.notify_pagerduty_key:
                        api_key = self.settings.get('pagerduty_api_key')
                    else:
                        api_key = self.rule.notify_pagerduty_key
                    try:
                        from inferno.lib.notifications import send_pagerduty
                        send_pagerduty(job_id=self.job.name, job_fail=exc,
                                       api_key=api_key, retry=self.rule.retry,
                                       retry_delay=self.rule.retry_delay)
                    except Exception as pd_ex:
                        log.error(
                            "Pagerduty notification failed for %s: %s",
                            self.job.name, pd_ex)
            raise
        else:
            if not self.settings.get('debug'):
                try_to_execute(partial(self._archive_tags, self.archiver))

            if self.rule.rule_cleanup:
                self._notify(JOB_CLEANUP)
                self.rule.rule_cleanup(self)
            self._notify(JOB_DONE)
            if self.rule.notify_on_success:
                try:
                    from inferno.lib.notifications import send_mail
                    msg = "Job %s finished successfully." % self.job.name
                    send_mail(job_id=self.job.name, job_fail=msg,
                              mail_to=self.rule.notify_addresses,
                              mail_from=self.settings.get('mail_from'),
                              mail_server=self.settings.get('mail_server'))
                except Exception as e:
                    log.error('Job %s failed notification: %s', self.job.name, e, exc_info=sys.exc_info())
        log.info('Finished job %s', self.job.name)

    def _determine_job_blobs(self):
        self._notify(JOB_BLOBS)
        tags = self.job_options.tags
        urls = self.job_options.urls + self.urls if self.urls else self.job_options.urls
        if tags or urls:
            log.info('Processing input: %s...', (tags + urls)[:1000])
        else:
            log.info('No input available for %s.' % self.rule.name)
        archiver = Archiver(
            ddfs=self.ddfs,
            archive_prefix=self.rule.archive_tag_prefix,
            archive_mode=self.rule.archive,
            max_blobs=self.rule.max_blobs,
            tags=tags,
            urls=urls,
            newest_first=self.rule.newest_first,
        )
        return archiver

    def _get_job_results(self, jobout):
        if self.rule.result_iterator:
            self._notify(JOB_RESULTS)
            return self.rule.result_iterator(jobout)

    def _profile(self, job):
        if self.settings.get('profile'):
            self._notify(JOB_PROFILE)
            job.profile_stats().sort_stats('cumulative').print_stats()

    def _tag_results(self, job_name):
        if self.job_options.result_tag:
            self._notify(JOB_TAG)
            
            # note that Disco changed its base tag name for saved results during the transition to version 0.5
            base_tag = 'disco:results:%s' if self.disco.master_version() >= '0.5' else 'disco:job:results:%s'
            result_name = base_tag % job_name
            suffix = job_name
            # try to guess a better suffix (ie. the date)
            # sort the tags the job ran on, take the last part of the last tag
            # if that looks like a date, use it, otherwise use the job name
            if self.rule.result_tag_suffix:
                if str(self.rule.result_tag_suffix).lower() == "date":
                    suffix = str(datetime.now().date())
                else:
                    if len(self.job_options.tags):
                        tags = sorted(self.job_options.tags)
                        date = (tags[-1].split(':'))[-1]
                        if len(date) == 10 and '-' in date:
                            suffix = date
            tag_name = '%s:%s' % (self.job_options.result_tag, suffix)
            log.info('Tagging result: %s', tag_name)
            try:
                try_to_execute(partial(self.ddfs.tag, tag_name,
                                       list(self.ddfs.blobs(result_name))))
            except Exception:
                log.error('Error tagging result %s', tag_name)
                raise

    def _process_results(self, jobout, job_id):
        if self.rule.result_processor:
            self._notify(JOB_PROCESS)
            results = self._get_job_results(jobout)
            self.rule.result_processor(
                results, params=self.params, job_id=job_id)

    def _purge(self, job_name):
        if not self.settings.get('no_purge'):
            self._notify(JOB_PURGE)
            self.disco.purge(job_name)

    def _archive_tags(self, archiver):
        if archiver.archive_mode:
            self._notify(JOB_ARCHIVE)
            archiver.archive()

    def _notify(self, stage):
        # if we are daemon spawn, tell mommy where we are
        if self.full_job_id:
            log.info("Worker: %s stage= %s " % (self.full_job_id, stage))

    def _enough_blobs(self, blob_count):
        # Note that the blob_count argument is the total number of tag blobs and urls.
        # To take urls into account: if no tags are specified but urls are available,
        # let the job run.
        if len(self.job_options.tags) == 0:
            if blob_count:
                return True
            else:
                log.info('Skipping job %s: %d blobs required, has %d',
                         self.rule.name, self.rule.min_blobs, blob_count)
                return False

        if not blob_count or (blob_count < self.rule.min_blobs and not self.settings.get('force')):
            log.info('Skipping job %s: %d blobs required, has %d',
                     self.rule.name, self.rule.min_blobs, blob_count)
            return False
        return True

    def __str__(self):
        return '<InfernoJob for: %s>' % self.rule.name
Example #14
        return others, active

    def jobinfo(self, jobname):
        """Returns a dict containing information about the job."""
        return json.loads(self.request('/disco/ctrl/jobinfo?name=%s' %
                                       jobname))

    def check_results(self, jobname, start_time, timeout, poll_interval):
        try:
            status, results = self.results(jobname, timeout=poll_interval)
        except CommError as e:
            status = 'active'
        if status == 'ready':
            return results
        if status != 'active':
            raise JobError(Job(name=jobname, master=self),
                           "Status %s" % status)
        if timeout and time.time() - start_time > timeout:
            raise JobError(Job(name=jobname, master=self), "Timeout")
        raise Continue()

    def wait(self,
             jobname,
             poll_interval=2,
             timeout=None,
             clean=False,
             show=None):
        """
        Block until the job has finished.
        Returns a list of the result urls.
Example #15
                others.append((jobname, (status, result)))
        return others, active

    def jobinfo(self, jobname):
        """Returns a dict containing information about the job."""
        return json.loads(self.request('/disco/ctrl/jobinfo?name=%s' % jobname))

    def check_results(self, jobname, start_time, timeout, poll_interval):
        try:
            status, results = self.results(jobname, timeout=poll_interval)
        except CommError as e:
            status = 'active'
        if status == 'ready':
            return results
        if status != 'active':
            raise JobError(Job(name=jobname, master=self), "Status %s" % status)
        if timeout and time.time() - start_time > timeout:
            raise JobError(Job(name=jobname, master=self), "Timeout")
        raise Continue()

    def wait(self, jobname, poll_interval=2, timeout=None, clean=False, show=None):
        """
        Block until the job has finished.
        Returns a list of the result urls.

        :type  poll_interval: int
        :param poll_interval: the number of seconds between job status requests.

        :type  timeout: int or None
        :param timeout: if specified, the number of seconds before returning or
                        raising a :class:`disco.JobError`.
Example #16
class InfernoJob(object):
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name,
                           master=self.disco.master,
                           worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker"
                     % (settings.get('worker'), e))
            self.job = Job(name=rule.name,
                           master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)

    @property
    def job_name(self):
        return self.job.name

    @property
    def rule_name(self):
        return self.rule.qualified_name

    def start(self):
        # process the map-results option (i.e. skip the map phase and grab map results from a job id/ddfs)
        self.archiver = self._determine_job_blobs()
        job_blobs = self.archiver.job_blobs
        #print "BLOOBS: %s" % job_blobs
        self.start_time = time.time()
        if self.settings.get('just_query'):
            self.query()
            return None
        if self._enough_blobs(len(job_blobs)):
            if self.rule.rule_init_function:
                self.rule.rule_init_function(self.params)
            self.job.run(name=self.rule.name,
                         input=job_blobs,
                         map=self.rule.map_function,
                         reduce=self.rule.reduce_function,
                         params=self.params,
                         partitions=self.rule.partitions,
                         map_input_stream=self.rule.map_input_stream,
                         map_output_stream=self.rule.map_output_stream,
                         map_init=self.rule.map_init_function,
                         save=self.rule.save or self.rule.result_tag is not None,
                         scheduler=self.rule.scheduler,
                         combiner=self.rule.combiner_function,
                         reduce_output_stream=self.rule.reduce_output_stream,
                         sort=self.rule.sort,
                         sort_buffer_size=self.rule.sort_buffer_size,
                         profile=self.settings.get('profile'),
                         partition=self.rule.partition_function,
                         required_files=self.rule.required_files,
                         required_modules=self.rule.required_modules)
            # actual id is only assigned after starting the job
            self.full_job_id = self.job.name
            return self.job
        return None

    def query(self):
        log.info("Query information:")
        pprint.pprint({'source query': self.archiver.tags,
                       'tag results': self.archiver.tag_map,
                       'total_blobs': self.archiver.blob_count})

    def _safe_str(self, value):
        try:
            return str(value)
        except UnicodeEncodeError:
            return unicode(value).encode('utf-8')

    def wait(self):
        blob_count = self.archiver.blob_count
        log.info('Started job %s processing %i blobs',
                 self.job.name, blob_count)
        self._notify(JOB_WAIT)
        try:
            jobout = self.job.wait()
            log.info('Done waiting for job %s', self.job.name)
            self._profile(self.job)
            self._tag_results(self.job.name)
            if not self.settings.get('debug'):
                try_to_execute(partial(self._process_results, jobout, self.job.name))
            else:
                results = self._get_job_results(jobout)
                reduce_result(results)
            self._purge(self._safe_str(self.job.name))
        except Exception as e:
            log.error('Job %s failed with %s', self.job.name, e.message)
            self._notify(JOB_ERROR)
            if self.rule.notify_on_fail:
                try:
                    from inferno.lib.notifications import send_mail
                    send_mail(job_id=self.job.name, job_fail=e,
                              mail_to=self.rule.notify_addresses,
                              mail_from=self.settings.get('mail_from'),
                              mail_server=self.settings.get('mail_server'))
                except Exception as e:
                    log.error('Job %s failed notification: %s', self.job.name, e, exc_info=sys.exc_info())
            raise
        else:
            if not self.settings.get('debug'):
                try_to_execute(partial(self._archive_tags, self.archiver))

            if self.rule.rule_cleanup:
                self._notify(JOB_CLEANUP)
                self.rule.rule_cleanup(self)
            self._notify(JOB_DONE)
            if self.rule.notify_on_success:
                try:
                    from inferno.lib.notifications import send_mail
                    msg = "Job %s finished successfully." % self.job.name
                    send_mail(job_id=self.job.name, job_fail=msg,
                              mail_to=self.rule.notify_addresses,
                              mail_from=self.settings.get('mail_from'),
                              mail_server=self.settings.get('mail_server'))
                except Exception as e:
                    log.error('Job %s failed notification: %s', self.job.name, e, exc_info=sys.exc_info())
        log.info('Finished job %s', self.job.name)

    def _determine_job_blobs(self):
        self._notify(JOB_BLOBS)
        tags = self.job_options.tags
        urls = self.job_options.urls + self.urls if self.urls else self.job_options.urls
        if tags or urls:
            log.info('Processing input: %s...', (tags + urls)[:1000])
        else:
            log.info('No input available for %s.' % self.rule.name)
        archiver = Archiver(
            ddfs=self.ddfs,
            archive_prefix=self.rule.archive_tag_prefix,
            archive_mode=self.rule.archive,
            max_blobs=self.rule.max_blobs,
            tags=tags,
            urls=urls,
            newest_first=self.rule.newest_first,
        )
        return archiver

    def _get_job_results(self, jobout):
        if self.rule.result_iterator:
            self._notify(JOB_RESULTS)
            return self.rule.result_iterator(jobout)

    def _profile(self, job):
        if self.settings.get('profile'):
            self._notify(JOB_PROFILE)
            job.profile_stats().sort_stats('cumulative').print_stats()

    def _tag_results(self, job_name):
        if self.job_options.result_tag:
            self._notify(JOB_TAG)
            result_name = 'disco:job:results:%s' % job_name
            suffix = job_name
            # try to guess a better suffix (ie. the date)
            # sort the tags the job ran on, take the last part of the last tag
            # if that looks like a date, use it, otherwise use the job name
            if self.rule.result_tag_suffix:
                if str(self.rule.result_tag_suffix).lower() == "date":
                    suffix = str(datetime.now().date())
                else:
                    if len(self.job_options.tags):
                        tags = sorted(self.job_options.tags)
                        date = (tags[-1].split(':'))[-1]
                        if len(date) == 10 and '-' in date:
                            suffix = date
            tag_name = '%s:%s' % (self.job_options.result_tag, suffix)
            log.info('Tagging result: %s', tag_name)
            try:
                try_to_execute(partial(self.ddfs.tag, tag_name,
                                       list(self.ddfs.blobs(result_name))))
            except Exception:
                log.error('Error tagging result %s', tag_name)
                raise

    def _process_results(self, jobout, job_id):
        if self.rule.result_processor:
            self._notify(JOB_PROCESS)
            results = self._get_job_results(jobout)
            self.rule.result_processor(
                results, params=self.params, job_id=job_id)

    def _purge(self, job_name):
        if not self.settings.get('no_purge'):
            self._notify(JOB_PURGE)
            self.disco.purge(job_name)

    def _archive_tags(self, archiver):
        if archiver.archive_mode:
            self._notify(JOB_ARCHIVE)
            archiver.archive()

    def _notify(self, stage):
        # if we are daemon spawn, tell mommy where we are
        if self.full_job_id:
            log.info("Worker: %s stage= %s " % (self.full_job_id, stage))

    def _enough_blobs(self, blob_count):
        # Note that the blob_count argument is the total number of tag blobs and urls.
        # To take urls into account: if no tags are specified but urls are available,
        # let the job run.
        if len(self.job_options.tags) == 0:
            if blob_count:
                return True
            else:
                log.info('Skipping job %s: %d blobs required, has %d',
                         self.rule.name, self.rule.min_blobs, blob_count)
                return False

        if not blob_count or (blob_count < self.rule.min_blobs and not self.settings.get('force')):
            log.info('Skipping job %s: %d blobs required, has %d',
                     self.rule.name, self.rule.min_blobs, blob_count)
            return False
        return True

    def __str__(self):
        return '<InfernoJob for: %s>' % self.rule.name
Example #17
def fit(dataset, n_clusters=5, max_iterations=10, random_state=None, save_results=True, show=False):
    """
    Optimize k-means clustering for at most `max_iterations` iterations,
    starting from `n_clusters` randomly initialized cluster centers.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
        if n_clusters < 2:
            raise Exception("Parameter n_clusters should be greater than 1.")
        if max_iterations < 1:
            raise Exception("Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        (
            "split",
            Stage("kmeans_init_map", input_chain=dataset.params["input_chain"], init=map_init, process=random_init_map),
        ),
        ("group_label", Stage("kmeans_init_reduce", process=estimate_reduce, init=simple_init, combine=True)),
    ]
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params["seed"] = random_state
    job.params["k"] = n_clusters

    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    centers = [(i, c) for i, c in result_iterator(init)]

    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = dict(dataset.params.items() + mean_point_center.items())
        job.params["k"] = n_clusters
        job.params["centers"] = centers

        job.pipeline = [
            (
                "split",
                Stage(
                    "kmeans_map_iter_%s" % (j + 1,),
                    input_chain=dataset.params["input_chain"],
                    process=estimate_map,
                    init=simple_init,
                ),
            ),
            (
                "group_label",
                Stage("kmeans_reduce_iter_%s" % (j + 1,), process=estimate_reduce, init=simple_init, combine=True),
            ),
        ]

        job.run(input=dataset.params["data_tag"], name="kmeans_iter_%d" % (j + 1,))
        fitmodel_url = job.wait(show=show)
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

    return {"kmeans_fitmodel": fitmodel_url}  # return results url