def _run_task(self, task_id):
    """Execute the scheduled task ``task_id`` and report the outcome to the scheduler.

    ``complete()`` is called on every dependency before the task runs; if any
    is incomplete a RuntimeError is raised and handled like any other failure.
    On failure the exception is logged, ``task.on_failure(ex)`` supplies the
    explanation and an error email is handed to ``notifications.send_email``.
    KeyboardInterrupt is always propagated.
    """
    task = self.__scheduled_tasks[task_id]
    logger.info('[pid %s] Running %s', os.getpid(), task_id)

    try:
        # Poll every dependency (no short-circuit); the last incomplete one
        # is the one named in the error.
        unfulfilled = [dep for dep in task.deps() if not dep.complete()]
        if unfulfilled:
            # TODO: possibly try to re-add the task as pending instead
            missing_dep = unfulfilled[-1]
            raise RuntimeError(
                'Unfulfilled dependency %r at run time!\nPrevious tasks: %r'
                % (missing_dep.task_id, self._previous_tasks))

        task.run()
        expl = json.dumps(task.on_success())
        logger.info('[pid %s] Done %s', os.getpid(), task_id)
        status = DONE
    except KeyboardInterrupt:
        raise
    except Exception as ex:
        status = FAILED
        logger.exception("[pid %s] Error while running %s" % (os.getpid(), task))
        expl = task.on_failure(ex)

        receiver = interface.get_config().get('core', 'error-email', None)
        sender = interface.get_config().get(
            'core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
        logger.info("[pid %s] Sending error email to %r", os.getpid(), receiver)
        subject = "Luigi: %s FAILED" % task
        notifications.send_email(subject, expl, sender, (receiver,))

    # Reached on both the success and the failure path.
    self.__scheduler.add_task(
        self.__id, task_id, status=status, expl=expl, runnable=None)
def add(self, task):
    """Register ``task`` with the scheduler and recurse into its dependencies.

    A task whose ``complete()`` raises, or returns something other than a
    boolean, is not scheduled: a warning is logged, a warning email is sent
    and the method returns without looking at the task's dependencies.
    Complete tasks are reported as DONE; tasks without a ``run``
    implementation are reported as non-runnable PENDING; everything else is
    reported as runnable PENDING and each dependency is added recursively.

    Any other exception is logged and emailed as a framework error and then
    swallowed. KeyboardInterrupt is always propagated.
    """
    try:
        task_id = task.task_id
        if task_id in self.__scheduled_tasks:
            return  # already scheduled

        logger.debug("Checking if %s is complete" % task_id)
        is_complete = False
        try:
            is_complete = task.complete()
            if is_complete not in (True, False):
                # Wrap in a 1-tuple so a tuple return value is reported
                # instead of breaking the %-formatting itself.
                raise Exception("Return value of Task.complete() must be boolean (was %r)" % (is_complete,))
        except KeyboardInterrupt:
            raise
        except:
            msg = "Will not schedule %s or any dependencies due to error in complete() method:" % (task,)
            logger.warning(msg, exc_info=1)  # like logger.exception but with WARNING level
            receiver = interface.get_config().get('core', 'error-email', None)
            sender = interface.get_config().get('core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
            logger.info("Sending warning email to %r" % receiver)
            notifications.send_email(
                subject="Luigi: %s failed scheduling" % (task,),
                message="%s:\n%s" % (msg, traceback.format_exc()),
                sender=sender,
                recipients=(receiver,))
            # Abort, i.e. don't schedule any subtasks of a task with a
            # failing complete()-method: we don't know if the task is
            # complete, and subtasks might not be desirable to run if they
            # have already run before.
            return

        if is_complete:
            # Not submitting dependencies of finished tasks
            self.__scheduler.add_task(self.__id, task_id, status=DONE, runnable=False)
        elif task.run == NotImplemented:
            self.__scheduled_tasks[task_id] = task
            self.__scheduler.add_task(self.__id, task_id, status=PENDING, runnable=False)
            logger.warning('Task %s is not complete and run() is not implemented. Probably a missing external dependency.', task_id)
        else:
            self.__scheduled_tasks[task_id] = task
            # Evaluate deps() once so the ids registered with the scheduler
            # and the tasks recursed into come from the same result.
            dep_tasks = list(task.deps())
            deps = [d.task_id for d in dep_tasks]
            self.__scheduler.add_task(self.__id, task_id, status=PENDING, deps=deps, runnable=True)
            logger.info('Scheduled %s' % task_id)

            for task_2 in dep_tasks:
                self.add(task_2)  # Schedule stuff recursively
    except KeyboardInterrupt:
        raise
    except:
        logger.exception("Luigi unexpected framework error while scheduling %s" % task)
        receiver = interface.get_config().get('core', 'error-email', None)
        sender = interface.get_config().get('core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
        notifications.send_email(
            subject="Luigi: Framework error while scheduling %s" % (task,),
            message="Luigi framework error:\n%s" % traceback.format_exc(),
            recipients=(receiver,),
            sender=sender)
def add(self, task):
    """Register ``task`` with the scheduler and recurse into its dependencies.

    Complete tasks are reported as DONE; tasks without a ``run``
    implementation are reported as non-runnable PENDING; everything else is
    reported as runnable PENDING and each dependency is added recursively.

    Any error while scheduling is logged, emailed (only when stdout is not a
    TTY and an ``error-email`` is configured) and then terminates the process
    with exit status 1. KeyboardInterrupt and SystemExit are propagated
    untouched, so the exit triggered by a failing dependency is reported once
    rather than again by every ancestor on the recursion stack.
    """
    try:
        task_id = task.task_id
        if task_id in self.__scheduled_tasks:
            return  # already scheduled

        logger.debug("Checking %s" % task_id)
        if task.complete():
            # Not submitting dependencies of finished tasks
            self.__scheduler.add_task(self.__id, task_id, status=DONE, runnable=False)
        elif task.run == NotImplemented:
            self.__scheduled_tasks[task_id] = task
            self.__scheduler.add_task(self.__id, task_id, status=PENDING, runnable=False)
            logger.warning(
                'Task %s is not complete and run() is not implemented. Probably a missing external dependency.',
                task_id)
        else:
            self.__scheduled_tasks[task_id] = task
            # Evaluate deps() once so the ids registered with the scheduler
            # and the tasks recursed into come from the same result.
            dep_tasks = list(task.deps())
            deps = [d.task_id for d in dep_tasks]
            self.__scheduler.add_task(self.__id, task_id, status=PENDING, deps=deps, runnable=True)
            logger.info('Scheduled %s' % task_id)

            for task_2 in dep_tasks:
                self.add(task_2)  # Schedule stuff recursively
    except (KeyboardInterrupt, SystemExit):
        # SystemExit comes from a nested add() that already reported its error.
        raise
    except:
        logger.exception("Error while trying to schedule %s" % task)
        if not sys.stdout.isatty():
            receiver = interface.get_config().get('core', 'error-email', None)
            if receiver is not None:
                email_body = "Scheduling error:\n%s" % traceback.format_exc()
                sender = interface.get_config().get(
                    'core', 'email-sender', 'luigi-client@%s' % socket.getfqdn())
                logger.info("Sending error email to %r" % receiver)
                send_email("Luigi: %s FAILED SCHEDULING" % task, email_body, sender, (receiver,))
        # Can't allow the task to run without its dependencies resolved.
        # sys.exit is used because the bare `exit` helper is only installed
        # by the `site` module.
        sys.exit(1)
def _run_task(self, task_id):
    """Execute the scheduled task ``task_id`` and report the outcome to the scheduler.

    ``complete()`` is called on every dependency before the task runs; if any
    is incomplete a RuntimeError is raised and handled like any other failure.
    On failure the exception is logged and ``task.on_failure(ex)`` supplies
    the explanation. An error email is sent only when stdout is not a TTY and
    an ``error-email`` is configured. KeyboardInterrupt is always propagated.
    """
    task = self.__scheduled_tasks[task_id]
    logger.info('[pid %s] Running %s', os.getpid(), task_id)

    try:
        # Poll every dependency (no short-circuit); the last incomplete one
        # is the one named in the error.
        unfulfilled = [dep for dep in task.deps() if not dep.complete()]
        if unfulfilled:
            # TODO: possibly try to re-add the task as pending instead
            missing_dep = unfulfilled[-1]
            raise RuntimeError(
                'Unfulfilled dependency %r at run time!\nPrevious tasks: %r'
                % (missing_dep.task_id, self._previous_tasks))

        task.run()
        expl = json.dumps(task.on_success())
        logger.info('[pid %s] Done %s', os.getpid(), task_id)
        status = DONE
    except KeyboardInterrupt:
        raise
    except Exception as ex:
        status = FAILED
        logger.exception("[pid %s] Error while running %s" % (os.getpid(), task))
        expl = task.on_failure(ex)

        interactive = sys.stdout.isatty()
        if not interactive:
            receiver = interface.get_config().get('core', 'error-email', None)
            if receiver is not None:
                default_sender = 'luigi-client@%s' % socket.getfqdn()
                sender = interface.get_config().get(
                    'core', 'email-sender', default_sender)
                logger.info("[pid %s] Sending error email to %r", os.getpid(), receiver)
                subject = "Luigi: %s FAILED" % task
                send_email(subject, expl, sender, (receiver,))

    # Reached on both the success and the failure path.
    self.__scheduler.add_task(
        self.__id, task_id, status=status, expl=expl, runnable=None)
def _create_scheduler():
    """Build a CentralPlannerScheduler from the ``[scheduler]`` config section.

    Reads ``retry-delay`` (default 900.0), ``remove-delay`` (default 600.0)
    and ``worker-disconnect-delay`` (default 60.0) as floats and passes them
    positionally, in that order, to the scheduler constructor.
    """
    cfg = interface.get_config()
    delay_options = (
        ('retry-delay', 900.0),
        ('remove-delay', 600.0),
        ('worker-disconnect-delay', 60.0),
    )
    delays = [cfg.getfloat('scheduler', option, fallback)
              for option, fallback in delay_options]
    return scheduler.CentralPlannerScheduler(*delays)
def use_cdh4_syntax():
    """Return True when the configured hadoop version is "cdh4".

    CDH4 (hadoop 2+) uses a slightly different command line syntax for
    interacting with hdfs. The ``version`` option in the ``[hadoop]`` config
    section defaults to "cdh4"; setting it to "cdh3" selects the old syntax.
    The comparison is case-insensitive.
    """
    import interface

    configured_version = interface.get_config().get("hadoop", "version", "cdh4")
    return configured_version.lower() == "cdh4"
def add(self, task):
    """Register ``task`` with the scheduler and recurse into its dependencies.

    Complete tasks are reported as DONE; tasks without a ``run``
    implementation are reported as non-runnable PENDING; everything else is
    reported as runnable PENDING and each dependency is added recursively.

    Any error while scheduling is logged with its traceback, emailed (only
    when stdout is not a TTY and an ``error-email`` is configured) and then
    terminates the process with exit status 1. KeyboardInterrupt and
    SystemExit are propagated untouched, so the exit triggered by a failing
    dependency is reported once rather than again by every ancestor on the
    recursion stack.
    """
    try:
        task_id = task.task_id
        if task_id in self.__scheduled_tasks:
            return  # already scheduled

        logger.debug("Checking %s" % task_id)
        if task.complete():
            # Not submitting dependencies of finished tasks
            self.__scheduler.add_task(self.__id, task_id, status=DONE, runnable=False)
        elif task.run == NotImplemented:
            self.__scheduled_tasks[task_id] = task
            self.__scheduler.add_task(self.__id, task_id, status=PENDING, runnable=False)
            logger.warning(
                "Task %s is not complete and run() is not implemented. Probably a missing external dependency.",
                task_id,
            )
        else:
            self.__scheduled_tasks[task_id] = task
            # Evaluate deps() once so the ids registered with the scheduler
            # and the tasks recursed into come from the same result.
            dep_tasks = list(task.deps())
            deps = [d.task_id for d in dep_tasks]
            self.__scheduler.add_task(self.__id, task_id, status=PENDING, deps=deps, runnable=True)
            logger.info("Scheduled %s" % task_id)

            for task_2 in dep_tasks:
                self.add(task_2)  # Schedule stuff recursively
    except (KeyboardInterrupt, SystemExit):
        # SystemExit comes from a nested add() that already reported its error.
        raise
    except:
        # format_exc() reads the active exception itself; its only parameter
        # is a frame-count limit, so no traceback object must be passed.
        expl = traceback.format_exc()
        logger.error(expl)
        logger.error("Error while trying to schedule %s" % task)
        if not sys.stdout.isatty():
            receiver = interface.get_config().get("core", "error-email", None)
            if receiver is not None:
                sender = interface.get_config().get("core", "email-sender", "luigi-client@%s" % socket.getfqdn())
                logger.info("Sending error email to %r" % receiver)
                send_email("Luigi: %s FAILED SCHEDULING" % task, expl, sender, (receiver,))
        # Can't allow the task to run without its dependencies resolved.
        # sys.exit is used because the bare `exit` helper is only installed
        # by the `site` module.
        sys.exit(1)
def _run_task(self, task_id):
    """Execute the scheduled task ``task_id`` and report the outcome to the scheduler.

    ``complete()`` is called on every dependency before the task runs; if any
    is incomplete a RuntimeError is raised and handled like any other failure.
    On failure ``task.on_failure`` receives the exception and the formatted
    traceback, and its JSON-encoded result becomes the explanation. An error
    email is sent only when stdout is not a TTY and an ``error-email`` is
    configured. KeyboardInterrupt is always propagated.
    """
    task = self.__scheduled_tasks[task_id]
    logger.info("[pid %s] Running %s", os.getpid(), task_id)
    try:
        # Verify that all the dependencies are fulfilled. Every dependency is
        # polled; the last incomplete one is the one named in the error.
        unfulfilled = [dep for dep in task.deps() if not dep.complete()]
        if unfulfilled:
            # TODO: possibly try to re-add the task as pending instead
            missing_dep = unfulfilled[-1]
            raise RuntimeError(
                "Unfulfilled dependency %r at run time!\nPrevious tasks: %r"
                % (missing_dep.task_id, self._previous_tasks)
            )
        task.run()
        expl = json.dumps(task.on_success())
        logger.info("[pid %s] Done %s", os.getpid(), task_id)
        status = DONE
    except KeyboardInterrupt:
        raise
    except Exception as ex:
        status = FAILED
        # format_exc() reads the active exception itself; its only parameter
        # is a frame-count limit, so no traceback object must be passed.
        expl = json.dumps(task.on_failure(ex, traceback.format_exc()))
        logger.error(expl)
        logger.exception("[pid %s] Error while running %s" % (os.getpid(), task))
        if not sys.stdout.isatty():
            receiver = interface.get_config().get("core", "error-email", None)
            if receiver is not None:
                sender = interface.get_config().get("core", "email-sender", "luigi-client@%s" % socket.getfqdn())
                logger.info("[pid %s] Sending error email to %r", os.getpid(), receiver)
                send_email("Luigi: %s FAILED" % task, expl, sender, (receiver,))

    # Reached on both the success and the failure path.
    self.__scheduler.add_task(self.__id, task_id, status=status, expl=expl, runnable=None)
def get_whoops_defaults(config=None):
    """Return the namenode host and port from the ``[hdfs]`` config section.

    :param config: object exposing ``get(section, option)``; when omitted (or
        falsy) ``interface.get_config()`` is used.
    :return: dict with the keys ``"host"`` and ``"port"``.
    :raises RuntimeError: if either option cannot be read.

    KeyboardInterrupt and SystemExit raised during the lookup are propagated
    unchanged instead of being reported as a missing configuration.
    """
    config = config or interface.get_config()
    try:
        return {
            "host": config.get("hdfs", "namenode_host"),
            "port": config.get("hdfs", "namenode_port"),
        }
    except Exception:
        raise RuntimeError("You must specify namenode_host and namenode_port "
                           "in the [hdfs] section of your luigi config in "
                           "order to use luigi's whoops support without a "
                           "fully-qualified url")
def runSimulator(input_file):
    """Run the simulation described by ``input_file`` and export the results.

    Creates a simpy environment, loads the configuration through
    ``interface.get_config`` with debugging off, attaches a Monitor to the
    links and flows, runs the environment with ``10 * 1000`` as the run
    argument and passes the monitor to ``export_results``.
    """
    environment = simpy.Environment()
    debug_enabled = False
    _hosts, links, flows, _routers = interface.get_config(
        environment, input_file, debug_enabled)
    monitor = Monitor(environment, links, flows)

    # Run the simulation
    run_until = 10 * 1000
    environment.run(run_until)

    # Graphing via show_results(monitor) is currently disabled; the results
    # are only exported (to output.xlsx, per the original note).
    export_results(monitor)
def __init__(self):
    """Initialise the runner with the ``streaming-jar`` option of the ``[hadoop]`` config section."""
    import interface

    jar_path = interface.get_config().get('hadoop', 'streaming-jar')
    super(DefaultHadoopJobRunner, self).__init__(streaming_jar=jar_path)
def run_job(self, job):
    """Submit ``job`` as a hadoop streaming job and move its output into place.

    Steps, in order: collect the python packages to ship, create a fresh
    temporary directory, build the ``hadoop jar`` argument list (libjars,
    extra files, job confs, mapper/reducer/combiner, formats, inputs and a
    temporary output path), write the package archive and the job dump,
    run the job, rename the temporary output to the job's real output path
    and call ``self.finish()``.
    """
    packages = [luigi] + self.modules + job.extra_modules() + list(_attached_packages)

    # find the module containing the job
    job_module = __import__(job.__module__, None, None, 'dummy')
    packages.append(job_module)

    # find the path to our runner.py; assume the source is next to the compiled file
    runner_path = mrrunner.__file__
    if runner_path.endswith("pyc"):
        runner_path = runner_path[:-3] + "py"

    tmp_root = interface.get_config().get('core', 'tmp-dir', '/tmp/luigi')
    job_dir_name = 'hadoop_job_%016x' % random.getrandbits(64)
    self.tmp_dir = os.path.join(tmp_root, job_dir_name)
    logger.debug("Tmp dir: %s", self.tmp_dir)
    os.makedirs(self.tmp_dir)

    # commands executed on the cluster for each phase
    map_cmd = 'python mrrunner.py map'
    cmb_cmd = 'python mrrunner.py combiner'
    red_cmd = 'python mrrunner.py reduce'

    # replace output with a temporary work directory
    output_final = job.output().path
    timestamp = datetime.datetime.now().isoformat().replace(':', '-')
    output_tmp_fn = output_final + '-temp-' + timestamp
    tmp_target = luigi.hdfs.HdfsTarget(output_tmp_fn, is_tmp=True)

    arglist = ['hadoop', 'jar', self.streaming_jar]

    # 'libjars' is a generic option, so place it first
    libjars = list(self.libjars)
    for hdfs_jar in self.libjars_in_hdfs:
        subprocess.call(['hadoop', 'fs', '-get', hdfs_jar, self.tmp_dir])
        local_jar = os.path.join(self.tmp_dir, os.path.basename(hdfs_jar))
        libjars.append(local_jar)
    if libjars:
        arglist.extend(['-libjars', ','.join(libjars)])

    # Add static files and directories
    shipped = []
    for src, dst in get_extra_files(job.extra_files()):
        flat_name = dst.replace('/', '_')
        dst_tmp = '%s_%09d' % (flat_name, random.randint(0, 999999999))
        shipped.append('%s#%s' % (src, dst_tmp))
        # -files doesn't support subdirectories, so the dst_tmp -> dst
        # link has to be created manually
        job._add_link(dst_tmp, dst)
    if shipped:
        arglist.extend(['-files', ','.join(shipped)])

    # The list returned by the job is extended in place with the runner's own confs.
    jobconfs = job.jobconfs()
    for key, value in self.jobconfs.iteritems():
        jobconfs.append('%s=%s' % (key, value))
    for conf in jobconfs:
        arglist.extend(['-D', conf])

    arglist.extend(self.streaming_args)
    arglist.extend(['-mapper', map_cmd, '-reducer', red_cmd])
    if job.combiner != NotImplemented:
        arglist.extend(['-combiner', cmb_cmd])

    packages_tar = self.tmp_dir + '/packages.tar'
    job_pickle = self.tmp_dir + '/job-instance.pickle'
    for path in (runner_path, packages_tar, job_pickle):
        arglist.extend(['-file', path])

    if self.output_format:
        arglist.extend(['-outputformat', self.output_format])
    if self.input_format:
        arglist.extend(['-inputformat', self.input_format])

    for target in luigi.task.flatten(job.input_hadoop()):
        assert isinstance(target, luigi.hdfs.HdfsTarget)
        arglist.extend(['-input', target.path])

    assert isinstance(job.output(), luigi.hdfs.HdfsTarget)
    arglist.extend(['-output', output_tmp_fn])

    # submit job
    create_packages_archive(packages, packages_tar)
    job._dump(self.tmp_dir)
    run_and_track_hadoop_job(arglist)

    # rename temporary work directory to given output
    tmp_target.move(output_final, fail_if_exists=True)
    self.finish()
def add(self, task):
    """Register ``task`` with the scheduler and recurse into its dependencies.

    :raises TaskException: if ``task`` is not a Task instance or reports
        itself as not initialized.

    A task whose ``complete()`` raises, or returns something other than a
    boolean, is not scheduled: a warning is logged, a warning email is sent
    and the method returns without looking at the task's dependencies.
    Complete tasks are reported as DONE; tasks without a ``run``
    implementation are reported as non-runnable PENDING; everything else has
    its dependencies validated, is reported as runnable PENDING and each
    dependency is added recursively.

    Any other exception raised while scheduling is logged and emailed as a
    framework error and then swallowed. KeyboardInterrupt is propagated.
    """
    if not isinstance(task, Task):
        # 1-tuple so that a tuple argument is reported instead of breaking
        # the %-formatting itself.
        raise TaskException('Can not schedule non-task %s' % (task,))

    if not task.initialized():
        # we can't get the repr of it since it's not initialized...
        raise TaskException(
            'Task of class %s not initialized. Did you override __init__ and forget to call super(...).__init__?'
            % task.__class__.__name__)

    try:
        task_id = task.task_id
        if task_id in self.__scheduled_tasks:
            return  # already scheduled

        logger.debug("Checking if %s is complete" % task_id)
        is_complete = False
        try:
            is_complete = task.complete()
            if is_complete not in (True, False):
                raise Exception(
                    "Return value of Task.complete() must be boolean (was %r)"
                    % (is_complete,))
        except KeyboardInterrupt:
            raise
        except:
            msg = "Will not schedule %s or any dependencies due to error in complete() method:" % (task,)
            logger.warning(msg, exc_info=1)  # like logger.exception but with WARNING level
            receiver = interface.get_config().get('core', 'error-email', None)
            sender = interface.get_config().get(
                'core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
            logger.info("Sending warning email to %r" % receiver)
            notifications.send_email(
                subject="Luigi: %s failed scheduling" % (task,),
                message="%s:\n%s" % (msg, traceback.format_exc()),
                sender=sender,
                recipients=(receiver,))
            # Abort, i.e. don't schedule any subtasks of a task with a
            # failing complete()-method: we don't know if the task is
            # complete, and subtasks might not be desirable to run if they
            # have already run before.
            return

        if is_complete:
            # Not submitting dependencies of finished tasks
            self.__scheduler.add_task(self.__id, task_id, status=DONE, runnable=False)
        elif task.run == NotImplemented:
            self.__scheduled_tasks[task_id] = task
            self.__scheduler.add_task(self.__id, task_id, status=PENDING, runnable=False)
            logger.warning(
                'Task %s is not complete and run() is not implemented. Probably a missing external dependency.',
                task_id)
        else:
            self.__scheduled_tasks[task_id] = task
            # Evaluate deps() once: the objects that are validated are the
            # same ones registered with the scheduler and recursed into.
            dep_tasks = list(task.deps())
            for d in dep_tasks:
                if isinstance(d, Target):
                    raise Exception(
                        'requires() can not return Target objects. Wrap it in an ExternalTask class')
                elif not isinstance(d, Task):
                    raise Exception('requires() must return Task objects')
            deps = [d.task_id for d in dep_tasks]
            self.__scheduler.add_task(self.__id, task_id, status=PENDING, deps=deps, runnable=True)
            logger.info('Scheduled %s' % task_id)

            for task_2 in dep_tasks:
                self.add(task_2)  # Schedule stuff recursively
    except KeyboardInterrupt:
        raise
    except:
        logger.exception(
            "Luigi unexpected framework error while scheduling %s" % task)
        receiver = interface.get_config().get('core', 'error-email', None)
        sender = interface.get_config().get(
            'core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
        notifications.send_email(
            subject="Luigi: Framework error while scheduling %s" % (task,),
            message="Luigi framework error:\n%s" % traceback.format_exc(),
            recipients=(receiver,),
            sender=sender)
def run_job(self, job):
    """Submit ``job`` as a hadoop streaming job and move its output into place.

    Steps, in order: collect the python packages to ship, create a fresh
    temporary directory, build the ``hadoop jar`` argument list (libjars,
    extra files, job confs, mapper/reducer/combiner, formats, inputs and a
    temporary output path), write the package archive and the job dump,
    run the job through ``self.run_and_track_hadoop_job``, rename the
    temporary output to the job's real output path and call
    ``self.finish()``.
    """
    packages = [luigi] + self.modules + job.extra_modules() + list(_attached_packages)

    # find the module containing the job
    defining_module = __import__(job.__module__, None, None, 'dummy')
    packages.append(defining_module)

    # find the path to our runner.py; assume the source is next to the compiled file
    runner_path = mrrunner.__file__
    if runner_path.endswith("pyc"):
        runner_path = runner_path[:-3] + "py"

    base_tmp_dir = interface.get_config().get('core', 'tmp-dir', '/tmp/luigi')
    unique_name = 'hadoop_job_%016x' % random.getrandbits(64)
    self.tmp_dir = os.path.join(base_tmp_dir, unique_name)
    logger.debug("Tmp dir: %s", self.tmp_dir)
    os.makedirs(self.tmp_dir)

    # commands executed on the cluster for each phase
    mapper_command = 'python mrrunner.py map'
    combiner_command = 'python mrrunner.py combiner'
    reducer_command = 'python mrrunner.py reduce'

    # replace output with a temporary work directory
    output_final = job.output().path
    stamp = datetime.datetime.now().isoformat().replace(':', '-')
    output_tmp_fn = output_final + '-temp-' + stamp
    tmp_target = luigi.hdfs.HdfsTarget(output_tmp_fn, is_tmp=True)

    arglist = ['hadoop', 'jar', self.streaming_jar]

    # 'libjars' is a generic option, so place it first
    libjars = list(self.libjars)
    for remote_jar in self.libjars_in_hdfs:
        subprocess.call(['hadoop', 'fs', '-get', remote_jar, self.tmp_dir])
        libjars.append(os.path.join(self.tmp_dir, os.path.basename(remote_jar)))
    if libjars:
        arglist.extend(['-libjars', ','.join(libjars)])

    # Add static files and directories
    file_specs = []
    for src, dst in get_extra_files(job.extra_files()):
        dst_tmp = '%s_%09d' % (dst.replace('/', '_'), random.randint(0, 999999999))
        file_specs.append('%s#%s' % (src, dst_tmp))
        # -files doesn't support subdirectories, so the dst_tmp -> dst
        # link has to be created manually
        job._add_link(dst_tmp, dst)
    if file_specs:
        arglist.extend(['-files', ','.join(file_specs)])

    # The list returned by the job is extended in place with the runner's own confs.
    jobconfs = job.jobconfs()
    for name, setting in self.jobconfs.iteritems():
        jobconfs.append('%s=%s' % (name, setting))
    for conf in jobconfs:
        arglist.extend(['-D', conf])

    arglist.extend(self.streaming_args)
    arglist.extend(['-mapper', mapper_command, '-reducer', reducer_command])
    if job.combiner != NotImplemented:
        arglist.extend(['-combiner', combiner_command])

    archive_path = self.tmp_dir + '/packages.tar'
    pickle_path = self.tmp_dir + '/job-instance.pickle'
    for attachment in (runner_path, archive_path, pickle_path):
        arglist.extend(['-file', attachment])

    if self.output_format:
        arglist.extend(['-outputformat', self.output_format])
    if self.input_format:
        arglist.extend(['-inputformat', self.input_format])

    for target in luigi.task.flatten(job.input_hadoop()):
        assert isinstance(target, luigi.hdfs.HdfsTarget)
        arglist.extend(['-input', target.path])

    assert isinstance(job.output(), luigi.hdfs.HdfsTarget)
    arglist.extend(['-output', output_tmp_fn])

    # submit job
    create_packages_archive(packages, archive_path)
    job._dump(self.tmp_dir)
    self.run_and_track_hadoop_job(arglist)

    # rename temporary work directory to given output
    tmp_target.move(output_final, fail_if_exists=True)
    self.finish()