def __init__(self, rule, settings, urls=None):
    """Build the disco Job for *rule*.

    Merges the rule's params with *settings* (settings win on key
    collisions) and resolves the worker class to use; *urls* are optional
    extra input urls kept for later blob resolution.
    """
    self.job_options = JobOptions(rule, settings)
    self.rule = rule
    self.settings = settings
    rule_params = dict(rule.params.__dict__)
    # 'server' may come from the rule params or fall back to the settings
    self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
    # settings take precedence over rule params
    rule_params.update(settings)
    self.params = Params(**rule_params)
    self.urls = urls
    try:
        # attempt to allow for overriden worker class from settings file or rule
        if rule.worker:
            worker = rule.worker
        else:
            # settings 'worker' is a dotted path "module.ClassName"
            worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            worker = getattr(mod, worker_class)()
        self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
    except Exception as e:
        # fall back to disco's default worker if the custom one can't load
        log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get('worker'), e))
        self.job = Job(name=rule.name, master=self.disco.master)
    # actual job id is only known after the job starts
    self.full_job_id = None
    self.jobinfo = None
    self._notify(JOB_START)
def __init__(self, rule, settings, urls=None):
    """Build the disco Job for *rule*.

    Merges the rule's params with *settings* (settings win on key
    collisions) and resolves the worker class to use; *urls* are optional
    extra input urls kept for later blob resolution.
    """
    self.job_options = JobOptions(rule, settings)
    self.rule = rule
    self.settings = settings
    rule_params = dict(rule.params.__dict__)
    # 'server' may come from the rule params or fall back to the settings
    self.disco, self.ddfs = get_disco_handle(rule_params.get("server", settings.get("server")))
    # settings take precedence over rule params
    rule_params.update(settings)
    self.params = Params(**rule_params)
    self.urls = urls
    try:
        # attempt to allow for overriden worker class from settings file or rule
        if rule.worker:
            worker = rule.worker
        else:
            # settings 'worker' is a dotted path "module.ClassName"
            worker_mod, dot, worker_class = settings.get("worker").rpartition(".")
            mod = __import__(worker_mod, {}, {}, worker_mod)
            worker = getattr(mod, worker_class)()
        self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
    except Exception as e:
        # fall back to disco's default worker if the custom one can't load
        log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get("worker"), e))
        self.job = Job(name=rule.name, master=self.disco.master)
    # actual job id is only known after the job starts
    self.full_job_id = None
    self.jobinfo = None
    self._notify(JOB_START)
def check_results(self, jobname, start_time, timeout, poll_interval):
    """Poll once for *jobname*'s results and return them when ready.

    Raises :class:`Continue` while the job is still active, and
    :class:`JobError` on a failed status or once more than *timeout*
    seconds have elapsed since *start_time*.
    """
    try:
        status, results = self.results(jobname, timeout=poll_interval)
    except CommError as e:
        # a transient master communication error counts as "still running"
        status = 'active'
    if status == 'ready':
        return results
    elif status != 'active':
        raise JobError(Job(name=jobname, master=self), "Status {0}".format(status))
    elapsed = time.time() - start_time
    if timeout and elapsed > timeout:
        raise JobError(Job(name=jobname, master=self), "Timeout")
    raise Continue()
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.

    :param dataset: input dataset wrapper; must expose ``params`` with
        ``input_chain`` and ``data_tag`` entries.
    :param fitmodel_url: dict holding the ``kmeans_fitmodel`` results url
        produced by a k-means ``fit``.
    :param save_results: persist job results in DDFS.
    :param show: enable console output of job events.
    :returns: url(s) of the prediction job results.
    :raises Exception: if *fitmodel_url* does not come from a k-means fit.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator
    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")
    job = Job(worker=Worker(save_results=save_results))
    # dict(a.items() + b.items()) is Python-2-only (items() returns views
    # on Python 3, which do not support +); copy-then-update is
    # behaviorally identical and works on both versions.
    params = dict(dataset.params.items())
    params.update(mean_point_center.items())
    job.params = params
    job.params["centers"] = [
        (i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])
    ]
    job.pipeline = [("split", Stage("kmeans_predict",
                                    input_chain=dataset.params["input_chain"],
                                    init=simple_init,
                                    process=predict_map))]
    job.run(input=dataset.params["data_tag"], name="kmeans_predict")
    return job.wait(show=show)
def profile_stats(self, jobname, mode='', stream=sys.stdout):
    """ Return a `pstats.Stats object
    <http://docs.python.org/library/profile.html#the-stats-class>`_
    aggregating the job's profiling data.
    :ref:`jobdict` must have had the ``profile`` flag enabled.

    :type  mode: 'map' or 'reduce' or ''
    :param mode: restricts results to the map or reduce phase, or not.

    :type  stream: file-like object
    :param stream: alternate output stream.
                   See the `pstats.Stats constructor
                   <http://docs.python.org/library/profile.html#pstats.Stats>`_.

    For instance, you can print out results as follows::

            job.profile_stats().sort_stats('cumulative').print_stats()

    .. versionadded:: 0.2.1
    """
    import pstats
    prefix = 'profile-%s' % mode
    # oob keys starting with the prefix hold the per-task profile dumps
    matching = [key for key in self.oob_list(jobname) if key.startswith(prefix)]
    if not matching:
        raise JobError(Job(name=jobname, master=self), "No profile data")
    first, rest = matching[0], matching[1:]
    stats = pstats.Stats(Stats(self.oob_get(jobname, first)), stream=stream)
    for key in rest:
        stats.add(Stats(self.oob_get(jobname, key)))
    stats.strip_dirs()
    stats.sort_stats('cumulative')
    return stats
def new_job(self, name, **jobargs):
    """
    Submits a new job request to the master.

    Equivalent to::

            Job(name=name, master=self.master).run(**jobargs)
    """
    job = Job(name=name, master=self.master)
    return job.run(**jobargs)
def get(program, key, jobname):
    """Usage: key jobname

    Print the oob value for the given key and jobname.
    """
    from disco.job import Job
    # resolve the most recent job id for the given name, then fetch the value
    job = Job(name=program.job_history(jobname), master=program.disco)
    print(job.oob_get(key))
def oob(program, jobname):
    """Usage: jobname

    Print the oob keys for the named job.
    """
    from disco.job import Job
    keys = Job(name=jobname, master=program.disco).oob_list()
    for key in keys:
        print(key)
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.

    :param dataset: input dataset wrapper; must expose ``params`` with
        ``input_chain`` and ``data_tag`` entries.
    :param fitmodel_url: dict holding the ``kmeans_fitmodel`` results url
        produced by a k-means ``fit``.
    :param save_results: persist job results in DDFS.
    :param show: enable console output of job events.
    :returns: url(s) of the prediction job results.
    :raises Exception: if *fitmodel_url* does not come from a k-means fit.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")
    job = Job(worker=Worker(save_results=save_results))
    # dict(a.items() + b.items()) is Python-2-only (items() returns views
    # on Python 3, which do not support +); copy-then-update is
    # behaviorally identical and works on both versions.
    params = dict(dataset.params.items())
    params.update(mean_point_center.items())
    job.params = params
    job.params["centers"] = [(i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])]
    job.pipeline = [
        (
            "split",
            Stage("kmeans_predict", input_chain=dataset.params["input_chain"], init=simple_init, process=predict_map),
        )
    ]
    job.run(input=dataset.params["data_tag"], name="kmeans_predict")
    return job.wait(show=show)
def wait(self, jobname, poll_interval=2, timeout=None, clean=False, show=None):
    """
    Block until the job has finished.
    Returns a list of the result urls.

    :type  poll_interval: int
    :param poll_interval: the number of seconds between job status requests.

    :type  timeout: int or None
    :param timeout: if specified, the number of seconds before returning or
                    raising a :class:`disco.JobError`.

    :type  clean: bool
    :param clean: if `True`,
                  call :meth:`Disco.clean` when the job has finished.

                  .. deprecated:: 0.4

    :type  show: bool or string
    :param show: enables console output of job events.
                 The default is provided by :envvar:`DISCO_EVENTS`.

                 .. versionadded:: 0.2.3
    """
    if show is None:
        show = self.settings['DISCO_EVENTS']
    event_monitor = EventMonitor(Job(name=jobname, master=self.master), format=show, poll_interval=poll_interval)
    start_time = time.time()
    try:
        while True:
            event_monitor.refresh()
            try:
                # check_results raises Continue while the job is still
                # active; note poll_interval is scaled to milliseconds here
                return self.check_results(jobname, start_time, timeout, poll_interval * 1000)
            except Continue:
                continue
            finally:
                # runs on every poll iteration, not just at the end
                if clean:
                    self.clean(jobname)
                event_monitor.refresh()
    finally:
        event_monitor.cleanup()
class InfernoJob(object):
    """Binds an Inferno rule to a disco Job.

    Resolves the worker class, collects input blobs via an Archiver,
    starts the map/reduce job, and handles result tagging, processing,
    notification and cleanup once the job finishes.
    """

    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        # 'server' may come from the rule params or fall back to the settings
        self.disco, self.ddfs = get_disco_handle(rule_params.get("server", settings.get("server")))
        # settings take precedence over rule params
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls
        try:
            # attempt to allow for overriden worker class from settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                # settings 'worker' is a dotted path "module.ClassName"
                worker_mod, dot, worker_class = settings.get("worker").rpartition(".")
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
        except Exception as e:
            # fall back to disco's default worker if the custom one can't load
            log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get("worker"), e))
            self.job = Job(name=rule.name, master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)

    @property
    def job_name(self):
        return self.job.name

    @property
    def rule_name(self):
        return self.rule.qualified_name

    def start(self):
        """Start the disco job if enough input blobs are available.

        Returns the running Job, or None when only querying or when the
        blob threshold is not met.
        """
        # process the map-results option (ie. skip map phase and grab map results from job id/ddfs
        self.archiver = self._determine_job_blobs()
        job_blobs = self.archiver.job_blobs
        # print "BLOOBS: %s" % job_blobs
        self.start_time = time.time()
        if self.settings.get("just_query"):
            self.query()
            return None
        if self._enough_blobs(len(job_blobs)):
            if self.rule.rule_init_function:
                self.rule.rule_init_function(self.params)
            self.job.run(
                name=self.rule.name,
                input=job_blobs,
                map=self.rule.map_function,
                reduce=self.rule.reduce_function,
                params=self.params,
                partitions=self.rule.partitions,
                map_input_stream=self.rule.map_input_stream,
                map_output_stream=self.rule.map_output_stream,
                map_init=self.rule.map_init_function,
                save=self.rule.save or self.rule.result_tag is not None,
                scheduler=self.rule.scheduler,
                combiner=self.rule.combiner_function,
                reduce_output_stream=self.rule.reduce_output_stream,
                sort=self.rule.sort,
                sort_buffer_size=self.rule.sort_buffer_size,
                profile=self.settings.get("profile"),
                partition=self.rule.partition_function,
                required_files=self.rule.required_files,
                required_modules=self.rule.required_modules,
            )
            # actual id is only assigned after starting the job
            self.full_job_id = self.job.name
            return self.job
        return None

    def query(self):
        # report what would run, without actually starting the job
        log.info("Query information:")
        pprint.pprint(
            {
                "source query": self.archiver.tags,
                "tag results": self.archiver.tag_map,
                "total_blobs": self.archiver.blob_count,
            }
        )

    def _safe_str(self, value):
        # NOTE(review): unicode() is Python-2-only; on Python 3 the fallback
        # branch would raise NameError -- confirm the target interpreter.
        try:
            return str(value)
        except UnicodeEncodeError:
            return unicode(value).encode("utf-8")

    def wait(self):
        """Block until the job finishes; tag, process and purge results."""
        blob_count = self.archiver.blob_count
        log.info("Started job %s processing %i blobs", self.job.name, blob_count)
        self._notify(JOB_WAIT)
        try:
            jobout = self.job.wait()
            log.info("Done waiting for job %s", self.job.name)
            self._profile(self.job)
            self._tag_results(self.job.name)
            if not self.settings.get("debug"):
                self._process_results(jobout, self.job.name)
            else:
                # debug mode: reduce the results locally instead
                results = self._get_job_results(jobout)
                reduce_result(results)
            self._purge(self._safe_str(self.job.name))
        except Exception as e:
            log.error("Job %s failed", self.job.name)
            self._notify(JOB_ERROR)
            if self.rule.notify_on_fail:
                try:
                    from inferno.lib.notifications import send_mail
                    send_mail(
                        job_id=self.job.name,
                        job_fail=e,
                        mail_to=self.rule.notify_addresses,
                        mail_from=self.settings.get("mail_from"),
                        mail_server=self.settings.get("mail_server"),
                    )
                except Exception as e:
                    # a failing notification must not mask the job failure
                    log.error("Job %s failed notification: %s", self.job.name, e, exc_info=sys.exc_info())
            raise
        else:
            if not self.settings.get("debug"):
                self._archive_tags(self.archiver)
            if self.rule.rule_cleanup:
                self._notify(JOB_CLEANUP)
                self.rule.rule_cleanup(self)
            self._notify(JOB_DONE)
            if self.rule.notify_on_success:
                try:
                    from inferno.lib.notifications import send_mail
                    msg = "Job %s finished successfully." % self.job.name
                    send_mail(
                        job_id=self.job.name,
                        job_fail=msg,
                        mail_to=self.rule.notify_addresses,
                        mail_from=self.settings.get("mail_from"),
                        mail_server=self.settings.get("mail_server"),
                    )
                except Exception as e:
                    log.error("Job %s failed notification: %s", self.job.name, e, exc_info=sys.exc_info())
        log.info("Finished job %s", self.job.name)

    def _determine_job_blobs(self):
        """Build the Archiver that resolves the rule's tags/urls to blobs."""
        self._notify(JOB_BLOBS)
        tags = self.job_options.tags
        # extra urls given to the constructor are appended to the rule's own
        urls = self.job_options.urls + self.urls if self.urls else self.job_options.urls
        if tags or urls:
            log.info("Processing input: %s...", (tags + urls)[:1000])
        else:
            log.info("No input available for %s." % self.rule.name)
        archiver = Archiver(
            ddfs=self.ddfs,
            archive_prefix=self.rule.archive_tag_prefix,
            archive_mode=self.rule.archive,
            max_blobs=self.rule.max_blobs,
            tags=tags,
            urls=urls,
            newest_first=self.rule.newest_first,
        )
        return archiver

    def _get_job_results(self, jobout):
        # returns None when the rule has no result_iterator
        if self.rule.result_iterator:
            self._notify(JOB_RESULTS)
            return self.rule.result_iterator(jobout)

    def _profile(self, job):
        # dump profiling output when the job ran with profiling enabled
        if self.settings.get("profile"):
            self._notify(JOB_PROFILE)
            job.profile_stats().sort_stats("cumulative").print_stats()

    def _tag_results(self, job_name):
        """Tag the job's saved results in DDFS under the rule's result tag."""
        if self.job_options.result_tag:
            self._notify(JOB_TAG)
            result_name = "disco:job:results:%s" % job_name
            suffix = job_name
            # try to guess a better suffix (ie. the date)
            # sort the tags the job ran on, take the last part of the last tag
            # if that looks like a date, use it, otherwise use the job name
            if self.rule.result_tag_suffix:
                if str(self.rule.result_tag_suffix).lower() == "date":
                    suffix = str(datetime.now().date())
                else:
                    tags = sorted(self.job_options.tags)
                    date = (tags[-1].split(":"))[-1]
                    if len(date) == 10 and "-" in date:
                        suffix = date
            tag_name = "%s:%s" % (self.job_options.result_tag, suffix)
            log.info("Tagging result: %s", tag_name)
            try:
                self.ddfs.tag(tag_name, list(self.ddfs.blobs(result_name)))
            except Exception as e:
                log.error("Error tagging result %s", tag_name)
                raise

    def _process_results(self, jobout, job_id):
        if self.rule.result_processor:
            self._notify(JOB_PROCESS)
            results = self._get_job_results(jobout)
            self.rule.result_processor(results, params=self.params, job_id=job_id)

    def _purge(self, job_name):
        # purge disco's job data unless purging was explicitly disabled
        if not self.settings.get("no_purge"):
            self._notify(JOB_PURGE)
            self.disco.purge(job_name)

    def _archive_tags(self, archiver):
        if archiver.archive_mode:
            self._notify(JOB_ARCHIVE)
            archiver.archive()

    def _notify(self, stage):
        # if we are daemon spawn, tell mommy where we are
        if self.full_job_id:
            log.info("Worker: %s stage= %s " % (self.full_job_id, stage))

    def _enough_blobs(self, blob_count):
        # Note that argument blob_count is the total number of tag blobs and urls.
        # To take urls into account, if no tag specified but urls are available,
        # let it run
        if len(self.job_options.tags) == 0:
            if blob_count:
                return True
            else:
                log.info("Skipping job %s: %d blobs required, has %d", self.rule.name, self.rule.min_blobs, blob_count)
                return False
        if not blob_count or (blob_count < self.rule.min_blobs and not self.settings.get("force")):
            log.info("Skipping job %s: %d blobs required, has %d", self.rule.name, self.rule.min_blobs, blob_count)
            return False
        return True

    def __str__(self):
        return "<InfernoJob for: %s>" % self.rule.name
def fit(dataset, n_clusters=5, max_iterations=10, random_state=None, save_results=True, show=False):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.

    :param dataset: input dataset wrapper exposing ``params`` with
        ``input_chain`` and ``data_tag`` entries.
    :param n_clusters: number of clusters (must be > 1).
    :param max_iterations: number of estimation iterations (must be > 0).
    :param random_state: seed for the random initial center assignment.
    :param save_results: persist intermediate job results in DDFS.
    :param show: enable console output of job events.
    :returns: dict with the ``kmeans_fitmodel`` results url.
    :raises Exception: on non-numerical or out-of-range parameters.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator
    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
        if n_clusters < 2:
            raise Exception("Parameter n_clusters should be greater than 1.")
        if max_iterations < 1:
            raise Exception(
                "Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    def _job_params():
        # dict(a.items() + b.items()) is Python-2-only (items() returns
        # views on Python 3, which do not support +); copy-then-update is
        # behaviorally identical and works on both versions.
        params = dict(dataset.params.items())
        params.update(mean_point_center.items())
        return params

    # initial random assignment of cluster centers
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("kmeans_init_map", input_chain=dataset.params["input_chain"],
                           init=map_init,
                           process=random_init_map)),
                    ('group_label',
                     Stage("kmeans_init_reduce",
                           process=estimate_reduce,
                           init=simple_init,
                           combine=True))]
    job.params = _job_params()
    job.params['seed'] = random_state
    job.params['k'] = n_clusters

    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    centers = [(i, c) for i, c in result_iterator(init)]

    # iteratively re-estimate the centers from the current assignment
    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = _job_params()
        job.params['k'] = n_clusters
        job.params['centers'] = centers
        job.pipeline = [('split',
                         Stage("kmeans_map_iter_%s" % (j + 1, ),
                               input_chain=dataset.params["input_chain"],
                               process=estimate_map,
                               init=simple_init)),
                        ('group_label',
                         Stage("kmeans_reduce_iter_%s" % (j + 1, ),
                               process=estimate_reduce,
                               init=simple_init,
                               combine=True))]
        job.run(input=dataset.params["data_tag"], name='kmeans_iter_%d' % (j + 1, ))
        fitmodel_url = job.wait(show=show)
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

    return {"kmeans_fitmodel": fitmodel_url}  # return results url
class InfernoJob(object):
    """Binds an Inferno rule to a disco Job.

    Resolves the worker class, collects input blobs via an Archiver,
    starts the map/reduce job, and handles result tagging, processing,
    notification (mail / pagerduty) and cleanup once the job finishes.
    """

    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        # 'server' may come from the rule params or fall back to the settings
        self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
        # settings take precedence over rule params
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls
        try:
            # attempt to allow for overriden worker class from settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                # settings 'worker' is a dotted path "module.ClassName"
                worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
        except Exception as e:
            # fall back to disco's default worker if the custom one can't load
            log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get('worker'), e))
            self.job = Job(name=rule.name, master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)

    @property
    def job_name(self):
        return self.job.name

    @property
    def rule_name(self):
        return self.rule.qualified_name

    def start(self):
        """Start the disco job if enough input blobs are available.

        Returns the running Job, or None when only querying or when the
        blob threshold is not met.
        """
        # process the map-results option (ie. skip map phase and grab map results from job id/ddfs
        self.archiver = self._determine_job_blobs()
        job_blobs = self.archiver.job_blobs
        #print "BLOOBS: %s" % job_blobs
        self.start_time = time.time()
        if self.settings.get('just_query'):
            self.query()
            return None
        if self._enough_blobs(len(job_blobs)):
            if self.rule.rule_init_function:
                self.rule.rule_init_function(self.params)
            self.job.run(name=self.rule.name,
                         input=job_blobs,
                         map=self.rule.map_function,
                         reduce=self.rule.reduce_function,
                         params=self.params,
                         partitions=self.rule.partitions,
                         map_input_stream=self.rule.map_input_stream,
                         map_output_stream=self.rule.map_output_stream,
                         map_init=self.rule.map_init_function,
                         save=self.rule.save or self.rule.result_tag is not None,
                         scheduler=self.rule.scheduler,
                         combiner=self.rule.combiner_function,
                         reduce_output_stream=self.rule.reduce_output_stream,
                         sort=self.rule.sort,
                         sort_buffer_size=self.rule.sort_buffer_size,
                         profile=self.settings.get('profile'),
                         partition=self.rule.partition_function,
                         required_files=self.rule.required_files,
                         required_modules=self.rule.required_modules)
            # actual id is only assigned after starting the job
            self.full_job_id = self.job.name
            return self.job
        return None

    def query(self):
        # report what would run, without actually starting the job
        log.info("Query information:")
        pprint.pprint({'source query': self.archiver.tags,
                       'tag results': self.archiver.tag_map,
                       'total_blobs': self.archiver.blob_count})

    def _safe_str(self, value):
        # NOTE(review): unicode() is Python-2-only; on Python 3 the fallback
        # branch would raise NameError -- confirm the target interpreter.
        try:
            return str(value)
        except UnicodeEncodeError:
            return unicode(value).encode('utf-8')

    def wait(self):
        """Block until the job finishes; tag, process and purge results."""
        blob_count = self.archiver.blob_count
        log.info('Started job %s processing %i blobs', self.job.name, blob_count)
        self._notify(JOB_WAIT)
        try:
            jobout = self.job.wait()
            log.info('Done waiting for job %s', self.job.name)
            self._profile(self.job)
            self._tag_results(self.job.name)
            if not self.settings.get('debug'):
                try_to_execute(partial(self._process_results, jobout, self.job.name))
            else:
                # debug mode: reduce the results locally instead
                results = self._get_job_results(jobout)
                reduce_result(results)
            self._purge(self._safe_str(self.job.name))
        except Exception as e:
            # NOTE(review): e.message is Python-2-only; on Python 3 this line
            # itself raises AttributeError -- confirm the target interpreter.
            log.error('Job %s failed with %s', self.job.name, e.message)
            self._notify(JOB_ERROR)
            if self.rule.notify_on_fail:
                import traceback
                exc = traceback.format_exc(15)
                try:
                    from inferno.lib.notifications import send_mail
                    send_mail(job_id=self.job.name,
                              job_fail=exc,
                              mail_to=self.rule.notify_addresses,
                              mail_from=self.settings.get('mail_from'),
                              mail_server=self.settings.get('mail_server'),
                              retry=self.rule.retry,
                              retry_delay=self.rule.retry_delay)
                except Exception as mail_ex:
                    # a failing notification must not mask the job failure
                    log.error(
                        'Mail notification failed for %s: %s',
                        self.job.name, mail_ex, exc_info=sys.exc_info())
                if self.rule.notify_pagerduty:
                    # rule-level key overrides the global settings key
                    if not self.rule.notify_pagerduty_key:
                        api_key = self.settings.get('pagerduty_api_key')
                    else:
                        api_key = self.rule.notify_pagerduty_key
                    try:
                        from inferno.lib.notifications import send_pagerduty
                        send_pagerduty(job_id=self.job.name,
                                       job_fail=exc,
                                       api_key=api_key,
                                       retry=self.rule.retry,
                                       retry_delay=self.rule.retry_delay)
                    except Exception as pd_ex:
                        log.error(
                            "Pagerduty notification failed for %s: %s",
                            self.job.name, pd_ex)
            raise
        else:
            if not self.settings.get('debug'):
                try_to_execute(partial(self._archive_tags, self.archiver))
            if self.rule.rule_cleanup:
                self._notify(JOB_CLEANUP)
                self.rule.rule_cleanup(self, )
            self._notify(JOB_DONE)
            if self.rule.notify_on_success:
                try:
                    from inferno.lib.notifications import send_mail
                    msg = "Job %s finished successfully." % self.job.name
                    send_mail(job_id=self.job.name,
                              job_fail=msg,
                              mail_to=self.rule.notify_addresses,
                              mail_from=self.settings.get('mail_from'),
                              mail_server=self.settings.get('mail_server'))
                except Exception as e:
                    log.error('Job %s failed notification: %s', self.job.name, e, exc_info=sys.exc_info())
        log.info('Finished job %s', self.job.name)

    def _determine_job_blobs(self):
        """Build the Archiver that resolves the rule's tags/urls to blobs."""
        self._notify(JOB_BLOBS)
        tags = self.job_options.tags
        # extra urls given to the constructor are appended to the rule's own
        urls = self.job_options.urls + self.urls if self.urls else self.job_options.urls
        if tags or urls:
            log.info('Processing input: %s...', (tags + urls)[:1000])
        else:
            log.info('No input available for %s.' % self.rule.name)
        archiver = Archiver(
            ddfs=self.ddfs,
            archive_prefix=self.rule.archive_tag_prefix,
            archive_mode=self.rule.archive,
            max_blobs=self.rule.max_blobs,
            tags=tags,
            urls=urls,
            newest_first=self.rule.newest_first,
        )
        return archiver

    def _get_job_results(self, jobout):
        # returns None when the rule has no result_iterator
        if self.rule.result_iterator:
            self._notify(JOB_RESULTS)
            return self.rule.result_iterator(jobout)

    def _profile(self, job):
        # dump profiling output when the job ran with profiling enabled
        if self.settings.get('profile'):
            self._notify(JOB_PROFILE)
            job.profile_stats().sort_stats('cumulative').print_stats()

    def _tag_results(self, job_name):
        """Tag the job's saved results in DDFS under the rule's result tag."""
        if self.job_options.result_tag:
            self._notify(JOB_TAG)
            # note that Disco changed it's base tag name for saved results during transition to version 0.5
            # NOTE(review): lexicographic version compare ('0.10' < '0.5') --
            # confirm the format master_version() returns.
            base_tag = 'disco:results:%s' if self.disco.master_version() >= '0.5' else 'disco:job:results:%s'
            result_name = base_tag % job_name
            suffix = job_name
            # try to guess a better suffix (ie. the date)
            # sort the tags the job ran on, take the last part of the last tag
            # if that looks like a date, use it, otherwise use the job name
            if self.rule.result_tag_suffix:
                if str(self.rule.result_tag_suffix).lower() == "date":
                    suffix = str(datetime.now().date())
                else:
                    if len(self.job_options.tags):
                        tags = sorted(self.job_options.tags)
                        date = (tags[-1].split(':'))[-1]
                        if len(date) == 10 and '-' in date:
                            suffix = date
            tag_name = '%s:%s' % (self.job_options.result_tag, suffix)
            log.info('Tagging result: %s', tag_name)
            try:
                try_to_execute(partial(self.ddfs.tag, tag_name, list(self.ddfs.blobs(result_name))))
            except Exception:
                log.error('Error tagging result %s', tag_name)
                raise

    def _process_results(self, jobout, job_id):
        if self.rule.result_processor:
            self._notify(JOB_PROCESS)
            results = self._get_job_results(jobout)
            self.rule.result_processor(
                results, params=self.params, job_id=job_id)

    def _purge(self, job_name):
        # purge disco's job data unless purging was explicitly disabled
        if not self.settings.get('no_purge'):
            self._notify(JOB_PURGE)
            self.disco.purge(job_name)

    def _archive_tags(self, archiver):
        if archiver.archive_mode:
            self._notify(JOB_ARCHIVE)
            archiver.archive()

    def _notify(self, stage):
        # if we are daemon spawn, tell mommy where we are
        if self.full_job_id:
            log.info("Worker: %s stage= %s " % (self.full_job_id, stage))

    def _enough_blobs(self, blob_count):
        # Note that argument blob_count is the total number of tag blobs and urls.
        # To take urls into account, if no tag specified but urls are available,
        # let it run
        if len(self.job_options.tags) == 0:
            if blob_count:
                return True
            else:
                log.info('Skipping job %s: %d blobs required, has %d',
                         self.rule.name, self.rule.min_blobs, blob_count)
                return False
        if not blob_count or (blob_count < self.rule.min_blobs and not self.settings.get('force')):
            log.info('Skipping job %s: %d blobs required, has %d',
                     self.rule.name, self.rule.min_blobs, blob_count)
            return False
        return True

    def __str__(self):
        return '<InfernoJob for: %s>' % self.rule.name
return others, active def jobinfo(self, jobname): """Returns a dict containing information about the job.""" return json.loads(self.request('/disco/ctrl/jobinfo?name=%s' % jobname)) def check_results(self, jobname, start_time, timeout, poll_interval): try: status, results = self.results(jobname, timeout=poll_interval) except CommError, e: status = 'active' if status == 'ready': return results if status != 'active': raise JobError(Job(name=jobname, master=self), "Status %s" % status) if timeout and time.time() - start_time > timeout: raise JobError(Job(name=jobname, master=self), "Timeout") raise Continue() def wait(self, jobname, poll_interval=2, timeout=None, clean=False, show=None): """ Block until the job has finished. Returns a list of the result urls.
others.append((jobname, (status, result))) return others, active def jobinfo(self, jobname): """Returns a dict containing information about the job.""" return json.loads(self.request('/disco/ctrl/jobinfo?name=%s' % jobname)) def check_results(self, jobname, start_time, timeout, poll_interval): try: status, results = self.results(jobname, timeout=poll_interval) except CommError, e: status = 'active' if status == 'ready': return results if status != 'active': raise JobError(Job(name=jobname, master=self), "Status %s" % status) if timeout and time.time() - start_time > timeout: raise JobError(Job(name=jobname, master=self), "Timeout") raise Continue() def wait(self, jobname, poll_interval=2, timeout=None, clean=False, show=None): """ Block until the job has finished. Returns a list of the result urls. :type poll_interval: int :param poll_interval: the number of seconds between job status requests. :type timeout: int or None :param timeout: if specified, the number of seconds before returning or raising a :class:`disco.JobError`.
class InfernoJob(object):
    """Binds an Inferno rule to a disco Job.

    Resolves the worker class, collects input blobs via an Archiver,
    starts the map/reduce job, and handles result tagging, processing,
    notification and cleanup once the job finishes.
    """

    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        # 'server' may come from the rule params or fall back to the settings
        self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
        # settings take precedence over rule params
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls
        try:
            # attempt to allow for overriden worker class from settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                # settings 'worker' is a dotted path "module.ClassName"
                worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
        except Exception as e:
            # fall back to disco's default worker if the custom one can't load
            log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get('worker'), e))
            self.job = Job(name=rule.name, master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)

    @property
    def job_name(self):
        return self.job.name

    @property
    def rule_name(self):
        return self.rule.qualified_name

    def start(self):
        """Start the disco job if enough input blobs are available.

        Returns the running Job, or None when only querying or when the
        blob threshold is not met.
        """
        # process the map-results option (ie. skip map phase and grab map results from job id/ddfs
        self.archiver = self._determine_job_blobs()
        job_blobs = self.archiver.job_blobs
        #print "BLOOBS: %s" % job_blobs
        self.start_time = time.time()
        if self.settings.get('just_query'):
            self.query()
            return None
        if self._enough_blobs(len(job_blobs)):
            if self.rule.rule_init_function:
                self.rule.rule_init_function(self.params)
            self.job.run(name=self.rule.name,
                         input=job_blobs,
                         map=self.rule.map_function,
                         reduce=self.rule.reduce_function,
                         params=self.params,
                         partitions=self.rule.partitions,
                         map_input_stream=self.rule.map_input_stream,
                         map_output_stream=self.rule.map_output_stream,
                         map_init=self.rule.map_init_function,
                         save=self.rule.save or self.rule.result_tag is not None,
                         scheduler=self.rule.scheduler,
                         combiner=self.rule.combiner_function,
                         reduce_output_stream=self.rule.reduce_output_stream,
                         sort=self.rule.sort,
                         sort_buffer_size=self.rule.sort_buffer_size,
                         profile=self.settings.get('profile'),
                         partition=self.rule.partition_function,
                         required_files=self.rule.required_files,
                         required_modules=self.rule.required_modules)
            # actual id is only assigned after starting the job
            self.full_job_id = self.job.name
            return self.job
        return None

    def query(self):
        # report what would run, without actually starting the job
        log.info("Query information:")
        pprint.pprint({'source query': self.archiver.tags,
                       'tag results': self.archiver.tag_map,
                       'total_blobs': self.archiver.blob_count})

    def _safe_str(self, value):
        # NOTE(review): unicode() is Python-2-only; on Python 3 the fallback
        # branch would raise NameError -- confirm the target interpreter.
        try:
            return str(value)
        except UnicodeEncodeError:
            return unicode(value).encode('utf-8')

    def wait(self):
        """Block until the job finishes; tag, process and purge results."""
        blob_count = self.archiver.blob_count
        log.info('Started job %s processing %i blobs', self.job.name, blob_count)
        self._notify(JOB_WAIT)
        try:
            jobout = self.job.wait()
            log.info('Done waiting for job %s', self.job.name)
            self._profile(self.job)
            self._tag_results(self.job.name)
            if not self.settings.get('debug'):
                try_to_execute(partial(self._process_results, jobout, self.job.name))
            else:
                # debug mode: reduce the results locally instead
                results = self._get_job_results(jobout)
                reduce_result(results)
            self._purge(self._safe_str(self.job.name))
        except Exception as e:
            # NOTE(review): e.message is Python-2-only; on Python 3 this line
            # itself raises AttributeError -- confirm the target interpreter.
            log.error('Job %s failed with %s', self.job.name, e.message)
            self._notify(JOB_ERROR)
            if self.rule.notify_on_fail:
                try:
                    from inferno.lib.notifications import send_mail
                    send_mail(job_id=self.job.name,
                              job_fail=e,
                              mail_to=self.rule.notify_addresses,
                              mail_from=self.settings.get('mail_from'),
                              mail_server=self.settings.get('mail_server'))
                except Exception as e:
                    # a failing notification must not mask the job failure
                    log.error('Job %s failed notification: %s', self.job.name, e, exc_info=sys.exc_info())
            raise
        else:
            if not self.settings.get('debug'):
                try_to_execute(partial(self._archive_tags, self.archiver))
            if self.rule.rule_cleanup:
                self._notify(JOB_CLEANUP)
                self.rule.rule_cleanup(self, )
            self._notify(JOB_DONE)
            if self.rule.notify_on_success:
                try:
                    from inferno.lib.notifications import send_mail
                    msg = "Job %s finished successfully." % self.job.name
                    send_mail(job_id=self.job.name,
                              job_fail=msg,
                              mail_to=self.rule.notify_addresses,
                              mail_from=self.settings.get('mail_from'),
                              mail_server=self.settings.get('mail_server'))
                except Exception as e:
                    log.error('Job %s failed notification: %s', self.job.name, e, exc_info=sys.exc_info())
        log.info('Finished job %s', self.job.name)

    def _determine_job_blobs(self):
        """Build the Archiver that resolves the rule's tags/urls to blobs."""
        self._notify(JOB_BLOBS)
        tags = self.job_options.tags
        # extra urls given to the constructor are appended to the rule's own
        urls = self.job_options.urls + self.urls if self.urls else self.job_options.urls
        if tags or urls:
            log.info('Processing input: %s...', (tags + urls)[:1000])
        else:
            log.info('No input available for %s.' % self.rule.name)
        archiver = Archiver(
            ddfs=self.ddfs,
            archive_prefix=self.rule.archive_tag_prefix,
            archive_mode=self.rule.archive,
            max_blobs=self.rule.max_blobs,
            tags=tags,
            urls=urls,
            newest_first=self.rule.newest_first,
        )
        return archiver

    def _get_job_results(self, jobout):
        # returns None when the rule has no result_iterator
        if self.rule.result_iterator:
            self._notify(JOB_RESULTS)
            return self.rule.result_iterator(jobout)

    def _profile(self, job):
        # dump profiling output when the job ran with profiling enabled
        if self.settings.get('profile'):
            self._notify(JOB_PROFILE)
            job.profile_stats().sort_stats('cumulative').print_stats()

    def _tag_results(self, job_name):
        """Tag the job's saved results in DDFS under the rule's result tag."""
        if self.job_options.result_tag:
            self._notify(JOB_TAG)
            result_name = 'disco:job:results:%s' % job_name
            suffix = job_name
            # try to guess a better suffix (ie. the date)
            # sort the tags the job ran on, take the last part of the last tag
            # if that looks like a date, use it, otherwise use the job name
            if self.rule.result_tag_suffix:
                if str(self.rule.result_tag_suffix).lower() == "date":
                    suffix = str(datetime.now().date())
                else:
                    if len(self.job_options.tags):
                        tags = sorted(self.job_options.tags)
                        date = (tags[-1].split(':'))[-1]
                        if len(date) == 10 and '-' in date:
                            suffix = date
            tag_name = '%s:%s' % (self.job_options.result_tag, suffix)
            log.info('Tagging result: %s', tag_name)
            try:
                try_to_execute(partial(self.ddfs.tag, tag_name, list(self.ddfs.blobs(result_name))))
            except Exception:
                log.error('Error tagging result %s', tag_name)
                raise

    def _process_results(self, jobout, job_id):
        if self.rule.result_processor:
            self._notify(JOB_PROCESS)
            results = self._get_job_results(jobout)
            self.rule.result_processor(
                results, params=self.params, job_id=job_id)

    def _purge(self, job_name):
        # purge disco's job data unless purging was explicitly disabled
        if not self.settings.get('no_purge'):
            self._notify(JOB_PURGE)
            self.disco.purge(job_name)

    def _archive_tags(self, archiver):
        if archiver.archive_mode:
            self._notify(JOB_ARCHIVE)
            archiver.archive()

    def _notify(self, stage):
        # if we are daemon spawn, tell mommy where we are
        if self.full_job_id:
            log.info("Worker: %s stage= %s " % (self.full_job_id, stage))

    def _enough_blobs(self, blob_count):
        # Note that argument blob_count is the total number of tag blobs and urls.
        # To take urls into account, if no tag specified but urls are available,
        # let it run
        if len(self.job_options.tags) == 0:
            if blob_count:
                return True
            else:
                log.info('Skipping job %s: %d blobs required, has %d',
                         self.rule.name, self.rule.min_blobs, blob_count)
                return False
        if not blob_count or (blob_count < self.rule.min_blobs and not self.settings.get('force')):
            log.info('Skipping job %s: %d blobs required, has %d',
                     self.rule.name, self.rule.min_blobs, blob_count)
            return False
        return True

    def __str__(self):
        return '<InfernoJob for: %s>' % self.rule.name
def fit(dataset, n_clusters=5, max_iterations=10, random_state=None, save_results=True, show=False):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.

    :param dataset: input dataset wrapper exposing ``params`` with
        ``input_chain`` and ``data_tag`` entries.
    :param n_clusters: number of clusters (must be > 1).
    :param max_iterations: number of estimation iterations (must be > 0).
    :param random_state: seed for the random initial center assignment.
    :param save_results: persist intermediate job results in DDFS.
    :param show: enable console output of job events.
    :returns: dict with the ``kmeans_fitmodel`` results url.
    :raises Exception: on non-numerical or out-of-range parameters.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
        if n_clusters < 2:
            raise Exception("Parameter n_clusters should be greater than 1.")
        if max_iterations < 1:
            raise Exception("Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    def _job_params():
        # dict(a.items() + b.items()) is Python-2-only (items() returns
        # views on Python 3, which do not support +); copy-then-update is
        # behaviorally identical and works on both versions.
        params = dict(dataset.params.items())
        params.update(mean_point_center.items())
        return params

    # initial random assignment of cluster centers
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        (
            "split",
            Stage("kmeans_init_map", input_chain=dataset.params["input_chain"], init=map_init, process=random_init_map),
        ),
        ("group_label", Stage("kmeans_init_reduce", process=estimate_reduce, init=simple_init, combine=True)),
    ]
    job.params = _job_params()
    job.params["seed"] = random_state
    job.params["k"] = n_clusters

    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    centers = [(i, c) for i, c in result_iterator(init)]

    # iteratively re-estimate the centers from the current assignment
    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = _job_params()
        job.params["k"] = n_clusters
        job.params["centers"] = centers
        job.pipeline = [
            (
                "split",
                Stage(
                    "kmeans_map_iter_%s" % (j + 1,),
                    input_chain=dataset.params["input_chain"],
                    process=estimate_map,
                    init=simple_init,
                ),
            ),
            (
                "group_label",
                Stage("kmeans_reduce_iter_%s" % (j + 1,), process=estimate_reduce, init=simple_init, combine=True),
            ),
        ]
        job.run(input=dataset.params["data_tag"], name="kmeans_iter_%d" % (j + 1,))
        fitmodel_url = job.wait(show=show)
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

    return {"kmeans_fitmodel": fitmodel_url}  # return results url