Example #1
def validate_uris(app_name, uris):
    key = 'uris'
    msg = ("pyspark app misconfigured:"
           " %s, if supplied, must be a list of hadoop-compatible filepaths"
           ) % key
    if not all(isinstance(x, six.string_types) for x in uris):
        log_and_raise(msg, extra=dict(app_name=app_name))
Example #2
def validate_uris(app_name, uris):
    key = 'uris'
    msg = ("pyspark app misconfigured:"
           " %s, if supplied, must be a list of hadoop-compatible filepaths"
           ) % key
    if not all(isinstance(x, six.string_types) for x in uris):
        log_and_raise(msg, extra=dict(app_name=app_name))
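
Examples #1 and #2 show the same URI validator used by the pyspark plugin. Below is a minimal sketch of how it behaves, assuming the validate_uris defined above is in scope; the log_and_raise stub is a hypothetical stand-in for the plugin's real helper, and the app name and URIs are made up:

import logging
import six

log = logging.getLogger(__name__)

def log_and_raise(msg, extra):
    # hypothetical stand-in: log with structured context, then raise
    log.error(msg, extra=extra)
    raise RuntimeError(msg)

validate_uris('my_pyspark_app', ['hdfs://nn/data/part-00000'])      # all entries are strings: returns silently
validate_uris('my_pyspark_app', ['hdfs://nn/data/part-00000', 42])  # 42 is not a string: raises via log_and_raise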
Example #3
def get_bash_cmd(app_name):
    """Lookup the bash command-line options for a bash task
    If they don't exist, return empty string"""
    dg = api.get_tasks_config()
    meta = dg[app_name]
    job_type = meta.get('job_type', 'bash')
    try:
        assert job_type == 'bash'
    except AssertionError:
        log.error("App is not a bash job",
                  extra=dict(app_name=app_name, job_type=job_type))
    rv = meta.get('bash_cmd', '')
    if not isinstance(rv, six.string_types):
        log_and_raise(
            "App config for bash plugin is misconfigured:"
            " bash_cmd is not a string", dict(app_name=app_name))
    return rv
Example #4
def get_bash_cmd(app_name):
    """Lookup the bash command-line options for a bash task
    If they don't exist, return empty string"""
    dg = api.get_tasks_config()
    meta = dg[app_name]
    job_type = meta.get('job_type', 'bash')
    try:
        assert job_type == 'bash'
    except AssertionError:
        log.error(
            "App is not a bash job", extra=dict(
                app_name=app_name, job_type=job_type))
    rv = meta.get('bash_cmd', '')
    if not isinstance(rv, six.string_types):
        log_and_raise(
            "App config for bash plugin is misconfigured:"
            " bash_cmd is not a string", dict(app_name=app_name))
    return rv
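
Examples #3 and #4 differ only in formatting. To make the lookup concrete, here is a hedged sketch of the configuration shape this function reads; the job_type and bash_cmd keys come from the code above, while the app name, command, and surrounding structure are illustrative (the real data comes from api.get_tasks_config()):

# Hypothetical tasks-config entry, shaped like what api.get_tasks_config() returns
tasks_config = {
    'my_bash_app': {
        'job_type': 'bash',
        'bash_cmd': 'echo {app_name} {job_id}',
    },
}

With such an entry, get_bash_cmd('my_bash_app') returns 'echo {app_name} {job_id}'; an app with a different job_type only logs an error, and a missing bash_cmd yields the empty string.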
Example #5
def main(ns):
    """
    A generic plugin that schedules arbitrary bash jobs using Stolos

    Assume code is written in Python.  For Scala or R code, use another option.
    """
    job_id = ns.job_id
    ld = dict(app_name=ns.app_name, job_id=ns.job_id)
    log.info('Running bash job', extra=ld)
    cmd = get_bash_cmd(ns.app_name)
    if ns.bash_cmd:
        cmd += ' '.join(ns.bash_cmd)
        log.debug("Appending user-supplied bash options to defaults",
                  extra=dict(app_name=ns.app_name, job_id=job_id, cmd=cmd))
    ld.update(cmd=cmd)
    if not cmd:
        raise UserWarning(
            "You need to specify bash options or configure default bash"
            " options")

    _cmdargs = dict(**ns.__dict__)
    _cmdargs.update(api.parse_job_id(ns.app_name, job_id))
    cmd = cmd.format(**_cmdargs)

    if ns.redirect_to_stderr:
        _std = sys.stderr
    else:
        _std = PIPE

    log.info('running command', extra=ld)
    returncode, stdout, stderr = run(cmd,
                                     shell=True,
                                     timeout=ns.watch,
                                     stdout=_std,
                                     stderr=_std)
    ld = dict(bash_returncode=returncode, stdout=stdout, stderr=stderr, **ld)
    if returncode == -9:
        log_and_raise("Bash job timed out", ld)
    elif returncode != 0:
        # this raises an error and logs output:
        log_and_raise("Bash job failed", ld)
    else:
        log.info("Bash job succeeded", extra=ld)
Example #6
def main(ns):
    """
    A generic plugin that schedules arbitrary bash jobs using Stolos

    Assume code is written in Python.  For Scala or R code, use another option.
    """
    job_id = ns.job_id
    ld = dict(app_name=ns.app_name, job_id=ns.job_id)
    log.info('Running bash job', extra=ld)
    cmd = get_bash_cmd(ns.app_name)
    if ns.bash_cmd:
        cmd += ' '.join(ns.bash_cmd)
        log.debug(
            "Appending user-supplied bash options to defaults", extra=dict(
                app_name=ns.app_name, job_id=job_id, cmd=cmd))
    ld.update(cmd=cmd)
    if not cmd:
        raise UserWarning(
            "You need to specify bash options or configure default bash"
            " options")

    _cmdargs = dict(**ns.__dict__)
    _cmdargs.update(api.parse_job_id(ns.app_name, job_id))
    cmd = cmd.format(**_cmdargs)

    if ns.redirect_to_stderr:
        _std = sys.stderr
    else:
        _std = PIPE

    log.info('running command', extra=ld)
    returncode, stdout, stderr = run(
        cmd, shell=True, timeout=ns.watch, stdout=_std, stderr=_std)
    ld = dict(bash_returncode=returncode, stdout=stdout, stderr=stderr, **ld)
    if returncode == -9:
        log_and_raise("Bash job timed out", ld)
    elif returncode != 0:
        # this raises an error and logs output:
        log_and_raise("Bash job failed", ld)
    else:
        log.info("Bash job succeeded", extra=ld)
Example #7
def validate_env(app_name, env):
    if not hasattr(env, 'items'):
        log_and_raise(("pyspark app misconfigured:"
                       " env, if supplied, must be a key: value mapping"),
                      dict(app_name=app_name))
    for k, v in env.items():
        if not isinstance(k, six.string_types):
            log_and_raise(("pyspark app misconfigured:"
                           "invalid key.  expected string"),
                          dict(app_name=app_name, key=k))
        if not isinstance(v, six.string_types):
            log_and_raise(("pyspark app misconfigured:"
                           "invalid value.  expected string"),
                          dict(app_name=app_name, value=v))
Example #8
def validate_env(app_name, env):
    if not hasattr(env, 'items'):
        log_and_raise(
            ("pyspark app misconfigured:"
             " env, if supplied, must be a key: value mapping"),
            dict(app_name=app_name))
    for k, v in env.items():
        if not isinstance(k, six.string_types):
            log_and_raise(
                ("pyspark app misconfigured:"
                 "invalid key.  expected string"),
                dict(app_name=app_name, key=k))
        if not isinstance(v, six.string_types):
            log_and_raise(
                ("pyspark app misconfigured:"
                 "invalid value.  expected string"),
                dict(app_name=app_name, value=v))
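
Examples #7 and #8 validate the optional env mapping. A short sketch of accepted and rejected shapes, assuming the validate_env above is in scope; the values are illustrative:

validate_env('my_pyspark_app', {'SPARK_HOME': '/opt/spark'})   # ok: string keys and string values
validate_env('my_pyspark_app', {'PYSPARK_PYTHON': 3})          # raises: value is not a string
validate_env('my_pyspark_app', ['SPARK_HOME=/opt/spark'])      # raises: a list has no .items(), so it is not a key: value mapping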
Example #9
def apply_data_transform(ns, sc, log_details, pjob_id, module):
    """Pass control to the module.main method.  If module.main specifies a
    `textFile` parameter, pass the textFile instance.  Otherwise, just map
    module.main on the RDD
    """
    func_args = inspect.getargspec(module.main).args
    if 'sc' in func_args:
        log.info(
            'passing spark context to a module.main function',
            extra=log_details)
        try:
            module.main(sc=sc, ns=ns, **pjob_id)
        except Exception as err:
            log_and_raise(
                "Job failed with error: %s" % err, log_details)
    else:
        read_fp = format_fp(ns.read_fp, ns, pjob_id)
        log_details = dict(read_fp=read_fp, **log_details)
        tf = sc.textFile(read_fp, ns.minPartitions)
        tf = pre_process_data(ns=ns, tf=tf, log_details=log_details)
        if 'textFile' in func_args:
            log.info(
                'passing textFile instance to a module.main function',
                extra=log_details)
            try:
                module.main(textFile=tf, ns=ns, **pjob_id)
            except Exception as err:
                log_and_raise(
                    "Job failed with error: %s" % err, log_details)

        else:
            write_fp = format_fp(ns.write_fp, ns, pjob_id)
            log.info(
                'mapping a module.main function to all elements in a textFile'
                ' and writing output',
                extra=dict(write_fp=write_fp, **log_details))
            try:
                (
                    tf
                    .map(functools.partial(module.main, ns=ns, **pjob_id))
                    .saveAsTextFile(write_fp)
                )
            except Exception as err:
                log_and_raise(err, log_details)
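
Example #9 dispatches on the signature of module.main via inspect.getargspec. A hedged sketch of the three shapes it distinguishes; the bodies are placeholders and the parameter names other than sc, textFile, and ns are hypothetical:

# Shape 1: main accepts `sc`, so it is handed the SparkContext directly.
def main(sc, ns, **parsed_job_id):
    pass

# Shape 2: main accepts `textFile`, so it receives the pre-processed RDD.
def main(textFile, ns, **parsed_job_id):
    pass

# Shape 3: neither parameter is present, so main is mapped over each element
# of the RDD and the results are saved to write_fp.
def main(element, ns, **parsed_job_id):
    return element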
Example #10
def validate_spark_conf(app_name, conf):
    # spark_conf - Is it a dict of str: str pairs?
    if not isinstance(conf, (dict, TasksConfigBaseMapping)):
        log_and_raise(("pyspark app improperly configured:"
                       " spark_conf must be a key:value mapping."),
                      dict(app_name=app_name))

    for k, v in conf.items():
        if not isinstance(k, six.string_types):
            log_and_raise(
                "pyspark app improperly configured:"
                " Key in spark_conf must be a string",
                dict(app_name=app_name, key=k, key_type=type(k)))
        if not isinstance(v, six.string_types + six.integer_types +
                          (bool, float)):
            log_and_raise(("pyspark app improperly configured:"
                           "Value for given key in spark_conf must be an"
                           " int, string or bool"),
                          dict(key=k, value_type=type(v), app_name=app_name))
Example #11
def validate_spark_conf(app_name, conf):
    # spark_conf - Is it a dict of str: str pairs?
    if not isinstance(conf, (dict, TasksConfigBaseMapping)):
        log_and_raise(
            ("pyspark app improperly configured:"
             " spark_conf must be a key:value mapping."),
            dict(app_name=app_name))

    for k, v in conf.items():
        if not isinstance(k, six.string_types):
            log_and_raise(
                "pyspark app improperly configured:"
                " Key in spark_conf must be a string",
                dict(app_name=app_name, key=k, key_type=type(k)))
        if not isinstance(v, six.string_types + six.integer_types +
                          (bool, float)):
            log_and_raise(
                ("pyspark app improperly configured:"
                 "Value for given key in spark_conf must be an"
                 " int, string or bool"),
                dict(key=k, value_type=type(v), app_name=app_name))
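
Examples #10 and #11 validate the spark_conf mapping. A short sketch of what passes and what is rejected, assuming the validate_spark_conf above is in scope; the conf entries are illustrative:

validate_spark_conf('my_pyspark_app', {'spark.executor.memory': '2g',
                                       'spark.speculation': True})        # ok: string and bool values
validate_spark_conf('my_pyspark_app', [('spark.executor.memory', '2g')])  # raises: not a dict / TasksConfigBaseMapping
validate_spark_conf('my_pyspark_app', {1: '2g'})                          # raises: key must be a string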