def pre_process_data(ns, tf, log_details):
    """
    Optionally sample and/or json-decode the input stream before it is handed
    to the rest of the data processing code.

    ns -- options object; `ns.sample` and `ns.mapjson` are read here
    tf -- the input stream; it must provide `.sample(...)` and `.map(...)`
        when the corresponding option is enabled
    log_details -- mapping merged into the `extra` of the sampling log record

    Returns the (possibly transformed) stream.  When neither option is set,
    `tf` is returned untouched.
    """
    stream = tf

    sampling_pct = ns.sample
    if sampling_pct:
        sampling_details = dict(sampling_pct=sampling_pct, **log_details)
        log.info(
            'sampling a percentage of input data without replacement',
            extra=sampling_details)
        # Positional arguments are presumably (withReplacement, fraction,
        # seed) as in Spark's RDD.sample -- TODO confirm for other stream
        # types.
        stream = stream.sample(False, sampling_pct, 0)

    if ns.mapjson:
        log.info('converting all elements in data stream to json')
        stream = stream.map(simplejson.loads)

    return stream
def apply_data_transform(ns, sc, log_details, pjob_id, module):
    """Pass control to the module.main method, choosing how to call it from
    the parameters that module.main declares.

    - If module.main accepts `sc`, it is called with the spark context
      (`module.main(sc=sc, ns=ns, **pjob_id)`) and no input is read here.
    - Otherwise `ns.read_fp` (expanded by `format_fp`) is opened with
      `sc.textFile` and passed through `pre_process_data`.  Then:
        * if module.main accepts `textFile`, it is called with that
          textFile instance;
        * else module.main (with `ns` and the `pjob_id` items bound as
          keyword arguments) is mapped over every element and the result is
          saved as text to `ns.write_fp` (expanded by `format_fp`).

    ns -- options object; `read_fp`, `write_fp` and `minPartitions` are read
        here, and it is forwarded to module.main
    sc -- the spark context
    log_details -- mapping attached as `extra` to log records
    pjob_id -- mapping of parsed job_id components, forwarded to module.main
        as keyword arguments
    module -- object exposing the `main` callable to run

    Any exception raised while running the job is handed to `log_and_raise`.
    """
    # inspect.getargspec was removed in Python 3.11.  Prefer getfullargspec
    # and only fall back to getargspec on interpreters that lack it.
    get_spec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    spec = get_spec(module.main)
    # module.main is always called with keyword arguments below, so
    # keyword-only parameters count as accepted parameters too.
    func_args = list(spec.args) + list(getattr(spec, 'kwonlyargs', None) or [])

    if 'sc' in func_args:
        log.info(
            'passing spark context to a module.main function',
            extra=log_details)
        try:
            module.main(sc=sc, ns=ns, **pjob_id)
        except Exception as err:
            log_and_raise(
                "Job failed with error: %s" % err, log_details)
    else:
        read_fp = format_fp(ns.read_fp, ns, pjob_id)
        log_details = dict(read_fp=read_fp, **log_details)
        tf = sc.textFile(read_fp, ns.minPartitions)
        tf = pre_process_data(ns=ns, tf=tf, log_details=log_details)
        if 'textFile' in func_args:
            log.info(
                'passing textFile instance to a module.main function',
                extra=log_details)
            try:
                module.main(textFile=tf, ns=ns, **pjob_id)
            except Exception as err:
                log_and_raise(
                    "Job failed with error: %s" % err, log_details)
        else:
            write_fp = format_fp(ns.write_fp, ns, pjob_id)
            log.info(
                'mapping a module.main function to all elements in a textFile'
                ' and writing output',
                extra=dict(write_fp=write_fp, **log_details))
            try:
                (
                    tf
                    .map(functools.partial(module.main, ns=ns, **pjob_id))
                    .saveAsTextFile(write_fp)
                )
            except Exception as err:
                # NOTE(review): unlike the two branches above, the exception
                # object itself (not a formatted message) is passed here --
                # confirm this is intended.
                log_and_raise(err, log_details)
def main(ns):
    """
    A generic plugin that runs an arbitrary bash job scheduled through Stolos.

    The command starts from the default returned by
    `get_bash_cmd(ns.app_name)`; any `ns.bash_cmd` items, joined by spaces,
    are appended to it.  The result is used as a `str.format` template that
    is filled from the attributes of `ns` plus whatever
    `api.parse_job_id(ns.app_name, ns.job_id)` returns, and is then executed
    through `run` with `shell=True` and `timeout=ns.watch`.

    Both output streams go to `sys.stderr` when `ns.redirect_to_stderr` is
    set, and to `PIPE` otherwise.

    A return code of 0 is logged as success, -9 is reported through
    `log_and_raise` as a timeout and any other code as a failure.

    Raises UserWarning when there is no command to run at all.
    """
    job_id = ns.job_id
    details = {'app_name': ns.app_name, 'job_id': ns.job_id}
    log.info('Running bash job', extra=details)

    command = get_bash_cmd(ns.app_name)
    user_options = ns.bash_cmd
    if user_options:
        # NOTE(review): no separator is inserted between the default command
        # and the user-supplied options -- confirm that configured defaults
        # are expected to end with whitespace (or be empty).
        command += ' '.join(user_options)
        log.debug(
            "Appending user-supplied bash options to defaults",
            extra={'app_name': ns.app_name, 'job_id': job_id, 'cmd': command})
    # The log details carry the command template, before substitution.
    details['cmd'] = command

    if not command:
        raise UserWarning(
            "You need to specify bash options or configure default bash"
            " options")

    template_values = dict(ns.__dict__)
    template_values.update(api.parse_job_id(ns.app_name, job_id))
    command = command.format(**template_values)

    stream = sys.stderr if ns.redirect_to_stderr else PIPE
    log.info('running command', extra=details)
    returncode, stdout, stderr = run(
        command, shell=True, timeout=ns.watch, stdout=stream, stderr=stream)

    outcome = {
        'bash_returncode': returncode, 'stdout': stdout, 'stderr': stderr}
    outcome.update(details)

    if returncode == -9:
        log_and_raise("Bash job timed out", outcome)
        return
    if returncode != 0:
        # this raises an error and logs output:
        log_and_raise("Bash job failed", outcome)
        return
    log.info("Bash job succeeded", extra=outcome)
def main(ns):
    """
    Stolos plugin entry point: build a bash command for the given job, run
    it, and log how it went.

    Steps, in order:
      1. Take the default command from `get_bash_cmd(ns.app_name)` and
         append the space-joined `ns.bash_cmd` items, if there are any.
      2. Raise UserWarning if that leaves an empty command.
      3. Fill the command (a `str.format` template) from the attributes of
         `ns` and the result of `api.parse_job_id(ns.app_name, ns.job_id)`.
      4. Execute it via `run(..., shell=True, timeout=ns.watch)`, sending
         stdout and stderr to `sys.stderr` when `ns.redirect_to_stderr` is
         set and to `PIPE` otherwise.
      5. Log success on return code 0; otherwise call `log_and_raise` with a
         timeout message for -9 or a generic failure message.
    """
    job_id = ns.job_id
    log_extra = dict(app_name=ns.app_name, job_id=ns.job_id)
    log.info('Running bash job', extra=log_extra)

    bash_cmd = get_bash_cmd(ns.app_name)
    if ns.bash_cmd:
        # NOTE(review): the options are concatenated directly onto the
        # default command with no separating space -- verify the configured
        # defaults account for that.
        bash_cmd += ' '.join(ns.bash_cmd)
        debug_extra = dict(app_name=ns.app_name, job_id=job_id, cmd=bash_cmd)
        log.debug(
            "Appending user-supplied bash options to defaults",
            extra=debug_extra)
    log_extra.update(cmd=bash_cmd)

    if not bash_cmd:
        raise UserWarning(
            "You need to specify bash options or configure default bash"
            " options")

    format_args = {}
    format_args.update(ns.__dict__)
    format_args.update(api.parse_job_id(ns.app_name, job_id))
    bash_cmd = bash_cmd.format(**format_args)

    out_stream = PIPE if not ns.redirect_to_stderr else sys.stderr
    log.info('running command', extra=log_extra)
    result = run(
        bash_cmd, shell=True, timeout=ns.watch,
        stdout=out_stream, stderr=out_stream)
    returncode, stdout, stderr = result

    log_extra = dict(
        bash_returncode=returncode, stdout=stdout, stderr=stderr,
        **log_extra)

    if returncode == 0:
        log.info("Bash job succeeded", extra=log_extra)
    else:
        # -9 is reported as a timeout; every other non-zero code is a plain
        # failure.  log_and_raise raises an error and logs output.
        failure_messages = {-9: "Bash job timed out"}
        log_and_raise(
            failure_messages.get(returncode, "Bash job failed"), log_extra)