def validate_uris(app_name, uris):
    key = 'uris'
    msg = ("pyspark app misconfigured:"
           " %s, if supplied, must be a list of hadoop-compatible filepaths"
           ) % key
    if not all(isinstance(x, six.string_types) for x in uris):
        log_and_raise(msg, dict(app_name=app_name))
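# Illustrative sketch (not part of the plugin): a ``uris`` value that would
# pass the check above is a flat list of hadoop-compatible filepath strings.
# The paths below are hypothetical examples.
#
#     uris = [
#         "hdfs:///user/example/helpers.py",
#         "s3://example-bucket/code/deps.zip",
#     ]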
def get_bash_cmd(app_name):
    """Lookup the bash command-line options for a bash task

    If they don't exist, return empty string"""
    dg = api.get_tasks_config()
    meta = dg[app_name]
    job_type = meta.get('job_type', 'bash')
    try:
        assert job_type == 'bash'
    except AssertionError:
        log.error("App is not a bash job", extra=dict(
            app_name=app_name, job_type=job_type))
    rv = meta.get('bash_cmd', '')
    if not isinstance(rv, six.string_types):
        log_and_raise(
            "App config for bash plugin is misconfigured:"
            " bash_cmd is not a string", dict(app_name=app_name))
    return rv
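# Illustrative sketch (hypothetical app config): a tasks-config entry that
# get_bash_cmd would resolve.  The app name and command below are made up;
# the only requirements are ``job_type: bash`` and a string ``bash_cmd``.
#
#     my_bash_app:
#         job_type: bash
#         bash_cmd: "echo hello world"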
def main(ns):
    """
    A generic plugin that schedules arbitrary bash jobs using Stolos

    Assume code is written in Python.  For Scala or R code, use another option.
    """
    job_id = ns.job_id
    ld = dict(app_name=ns.app_name, job_id=ns.job_id)
    log.info('Running bash job', extra=ld)
    cmd = get_bash_cmd(ns.app_name)
    if ns.bash_cmd:
        cmd += ' '.join(ns.bash_cmd)
        log.debug(
            "Appending user-supplied bash options to defaults", extra=dict(
                app_name=ns.app_name, job_id=job_id, cmd=cmd))
    ld.update(cmd=cmd)
    if not cmd:
        raise UserWarning(
            "You need to specify bash options or configure default bash"
            " options")
    _cmdargs = dict(**ns.__dict__)
    _cmdargs.update(api.parse_job_id(ns.app_name, job_id))
    cmd = cmd.format(**_cmdargs)
    if ns.redirect_to_stderr:
        _std = sys.stderr
    else:
        _std = PIPE
    log.info('running command', extra=ld)
    returncode, stdout, stderr = run(
        cmd, shell=True, timeout=ns.watch,
        stdout=_std, stderr=_std)
    ld = dict(bash_returncode=returncode, stdout=stdout, stderr=stderr, **ld)
    if returncode == -9:
        log_and_raise("Bash job timed out", ld)
    elif returncode != 0:
        # this raises an error and logs output:
        log_and_raise("Bash job failed", ld)
    else:
        log.info("Bash job succeeded", extra=ld)
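# Illustrative sketch: because main() calls ``cmd.format(**_cmdargs)``, a
# configured ``bash_cmd`` may reference argparse options and parsed job_id
# components by name.  Assuming the app's job_id template includes a
# ``{date}`` component (an assumption, not guaranteed by this plugin), a
# config such as
#
#     bash_cmd: "python process.py --date {date} --app {app_name}"
#
# would have ``{date}`` and ``{app_name}`` expanded before the command runs.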
def validate_env(app_name, env):
    if not hasattr(env, 'items'):
        log_and_raise(
            ("pyspark app misconfigured:"
             " env, if supplied, must be a key: value mapping"),
            dict(app_name=app_name))
    for k, v in env.items():
        if not isinstance(k, six.string_types):
            log_and_raise(
                ("pyspark app misconfigured:"
                 " invalid key. expected string"),
                dict(app_name=app_name, key=k))
        if not isinstance(v, six.string_types):
            log_and_raise(
                ("pyspark app misconfigured:"
                 " invalid value. expected string"),
                dict(app_name=app_name, value=v))
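# Illustrative sketch: a valid ``env`` value is a flat string-to-string
# mapping.  The variables below are hypothetical examples.
#
#     env = {
#         "PYTHONPATH": "/opt/myapp",
#         "SPARK_LOCAL_DIRS": "/mnt/spark",
#     }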
def apply_data_transform(ns, sc, log_details, pjob_id, module):
    """Pass control to the module.main method.  If module.main specifies a
    `textFile` parameter, pass the textFile instance.  Otherwise, just map
    module.main on the RDD
    """
    func_args = inspect.getargspec(module.main).args
    if 'sc' in func_args:
        log.info(
            'passing spark context to a module.main function',
            extra=log_details)
        try:
            module.main(sc=sc, ns=ns, **pjob_id)
        except Exception as err:
            log_and_raise("Job failed with error: %s" % err, log_details)
    else:
        read_fp = format_fp(ns.read_fp, ns, pjob_id)
        log_details = dict(read_fp=read_fp, **log_details)
        tf = sc.textFile(read_fp, ns.minPartitions)
        tf = pre_process_data(ns=ns, tf=tf, log_details=log_details)
        if 'textFile' in func_args:
            log.info(
                'passing textFile instance to a module.main function',
                extra=log_details)
            try:
                module.main(textFile=tf, ns=ns, **pjob_id)
            except Exception as err:
                log_and_raise("Job failed with error: %s" % err, log_details)
        else:
            write_fp = format_fp(ns.write_fp, ns, pjob_id)
            log.info(
                'mapping a module.main function to all elements in a textFile'
                ' and writing output',
                extra=dict(write_fp=write_fp, **log_details))
            try:
                (
                    tf
                    .map(functools.partial(module.main, ns=ns, **pjob_id))
                    .saveAsTextFile(write_fp)
                )
            except Exception as err:
                log_and_raise(err, log_details)
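# Illustrative sketch (not part of this plugin): a hypothetical user module
# whose ``main`` accepts ``sc``, so apply_data_transform hands it the
# SparkContext directly.  Modules may instead accept ``textFile`` to receive
# the pre-processed RDD, or define a plain element-wise ``main`` that gets
# mapped over the RDD.
#
#     def main(sc, ns, **pjob_id):
#         rdd = sc.textFile(ns.read_fp)
#         rdd.map(lambda line: line.upper()).saveAsTextFile(ns.write_fp)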
def validate_spark_conf(app_name, conf):
    # spark_conf - Is it a dict of str: str pairs?
    if not isinstance(conf, (dict, TasksConfigBaseMapping)):
        log_and_raise(
            ("pyspark app improperly configured:"
             " spark_conf must be a key:value mapping."),
            dict(app_name=app_name))

    for k, v in conf.items():
        if not isinstance(k, six.string_types):
            log_and_raise(
                "pyspark app improperly configured:"
                " Key in spark_conf must be a string",
                dict(app_name=app_name, key=k, key_type=type(k)))
        if not isinstance(
                v, six.string_types + six.integer_types + (bool, float)):
            log_and_raise(
                ("pyspark app improperly configured:"
                 " Value for given key in spark_conf must be an"
                 " int, float, string or bool"),
                dict(key=k, value_type=type(v), app_name=app_name))
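# Illustrative sketch: a ``spark_conf`` value that satisfies the checks above
# maps string keys to string/int/float/bool values.  The settings below are
# ordinary Spark properties chosen for illustration.
#
#     spark_conf = {
#         "spark.executor.memory": "2g",
#         "spark.speculation": True,
#     }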