Example #1
def s3_bucket_exists(bucket):
    """
    Code from Amazon docs for checking bucket existence.

    Args:
        bucket:

    Returns:
        booL: whether bucket exists

    """
    import botocore

    s3 = get_s3_resource()
    exists = True
    try:
        s3.meta.client.head_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as e:
        error_code = int(e.response['Error']['Code'])
        if error_code == 404:
            exists = False
        elif error_code == 403:
            # For buckets you can get a 403 (forbidden) instead of a 404 (resource not found):
            # if you have the s3:ListBucket permission on the bucket, Amazon S3 returns an
            # HTTP 404 ("not found") error. If you don't have the s3:ListBucket permission,
            # Amazon S3 returns an HTTP 403 ("access denied") error.
            _logger.info(
                "aws_s3: bucket {} raised a 403 (access forbidden), do you have ListBucket permission?"
                .format(bucket))
            exists = False
        else:
            raise
    return exists
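
As a companion to the example above, here is a minimal, self-contained sketch of the same 404/403 check using a plain boto3 client instead of the project's get_s3_resource() helper. The bucket name is hypothetical; boto3 and botocore are assumed to be installed and configured.

import boto3
import botocore


def bucket_exists(bucket_name):
    """Return True if the bucket exists and is reachable; False on a 404 or 403."""
    s3_client = boto3.client('s3')
    try:
        # head_bucket is a lightweight existence/permission probe.
        s3_client.head_bucket(Bucket=bucket_name)
        return True
    except botocore.exceptions.ClientError as e:
        error_code = int(e.response['Error']['Code'])
        if error_code in (403, 404):
            # 404: no such bucket; 403: the bucket may exist but we lack permission.
            return False
        raise


if __name__ == '__main__':
    print(bucket_exists('my-hypothetical-bucket'))  # hypothetical bucket name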
Example #2
def s3_path_exists(s3_url):
    """
    Given an entire path, does the key exist?

    If you're checking for partial key, make sure to end with '/'

    This is how you make "folders" in s3, you use a key ending with '/'
    e.g., s3://mybucket/onelevel/anotherdir/
    bucket = mybucket
    key = onelevel/anotherdir/ -- it's a zero size object.

    If checking for full path, you can end with thing itself.

    Args:
        s3_url:

    Returns:

    """
    import botocore

    s3 = get_s3_resource()
    bucket, key = split_s3_url(s3_url)
    if key is None:
        return s3_bucket_exists(bucket)

    try:
        s3.Object(bucket, key).load()
    except botocore.exceptions.ClientError as e:
        error_code = int(e.response['Error']['Code'])
        _logger.info("Error code {}".format(error_code))
        if error_code == 404:
            return False
        else:
            raise

    return True
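
The docstring above describes S3 "folders" as zero-size objects whose keys end with '/'. The following sketch (hypothetical bucket name; boto3 assumed to be installed and configured) shows how such a folder marker could be created and then detected with head_object, which is essentially what Object(...).load() performs.

import boto3
import botocore

s3_client = boto3.client('s3')
bucket = 'my-hypothetical-bucket'  # hypothetical

# Create a "folder" by writing a zero-size object whose key ends with '/'.
s3_client.put_object(Bucket=bucket, Key='onelevel/anotherdir/', Body=b'')


def key_exists(bucket_name, key):
    """Return True if the exact key exists (folder markers must include the trailing '/')."""
    try:
        s3_client.head_object(Bucket=bucket_name, Key=key)
        return True
    except botocore.exceptions.ClientError as e:
        if int(e.response['Error']['Code']) == 404:
            return False
        raise


print(key_exists(bucket, 'onelevel/anotherdir/'))  # True: the folder marker itself
print(key_exists(bucket, 'onelevel/anotherdir'))   # False: no object exists without the trailing '/'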
Example #3
def resolve_bundle(pfs, pipe, is_left_edge_task, data_context):
    """
    Args:
        pfs: pipe fs object
        pipe: the pipe to investigate
        is_left_edge_task: True if this task starts the DAG.
        data_context: the data context object from which we should resolve bundles.

    Returns:
        bool: True if bundle found (not re-running).  False if bundle not found or being regenerated

    """

    # These are constants
    verbose = False
    use_bundle = True
    regen_bundle = False

    # 1.) Get output bundle for pipe_id (the specific pipeline/transform/param hash).

    if verbose: print("resolve_bundle: looking up bundle {}".format(pipe.pipe_id()))

    if pipe._mark_force and not worker._is_external(pipe):
        # Forcing recomputation through a manual annotation in the pipe.pipe_requires() itself
        # If it is external, we don't recompute in any case.
        _logger.debug("resolve_bundle: pipe.mark_force forcing a new output bundle.")
        if verbose: print("resolve_bundle: pipe.mark_force forcing a new output bundle.\n")
        pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
        return regen_bundle

    if pipe.force:
        # Forcing recomputation through a manual --force directive
        _logger.debug("resolve_bundle: --force forcing a new output bundle.")
        if verbose: print("resolve_bundle: --force forcing a new output bundle.\n")
        pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
        return regen_bundle

    bndls = pfs.get_hframe_by_proc(pipe.pipe_id(), getall=True, data_context=data_context)
    if bndls is None or len(bndls) <= 0:
        if verbose: print("resolve_bundle: No bundle with proc_name {}, getting new output bundle.\n".format(pipe.pipe_id()))
        # no bundle, force recompute
        pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
        return regen_bundle

    bndl = bndls[0]  # our best guess is the most recent bundle with the same pipe_id()

    # 2.) Bundle exists - lineage object tells us input bundles.
    lng = bndl.get_lineage()
    if lng is None:
        if verbose: print("resolve_bundle: No lineage present, getting new output bundle.\n")
        pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
        return regen_bundle

    # 3.) Lineage record exists -- if new code, re-run
    pipeline_path = os.path.dirname(sys.modules[pipe.__module__].__file__)
    current_version = fs.DisdatFS().get_pipe_version(pipeline_path)

    if different_code_versions(current_version, lng):
        if verbose: print("resolve_bundle: New code version, getting new output bundle.\n")
        pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
        return regen_bundle

    # 3.5.) Have we changed the output human bundle name?  If so, re-run the task.
    # Note: we need to go through all the bundle versions with that processing_id,
    # because, at the moment, we make new bundles when we change the name, even though
    # in some sense it's just a tag set that should include other names while the data stays the same.

    current_human_name = pipe.pipeline_id()
    found = False
    for bndl in bndls:
        if current_human_name == bndl.get_human_name():
            found = True
            break
    if not found:
        if verbose: print("resolve_bundle: New human name {} (prior {}), getting new output bundle.\n".format(
            current_human_name, bndl.get_human_name()))
        pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
        return regen_bundle

    # 4.) Check the inputs -- assumes we have processed upstream tasks already
    for task in pipe.requires():
        """ Are we re-running an upstream input (look in path cache)?
        At this time the only bundles a task depends on are the ones created by its upstream tasks.
        We have to look through its *current* list of possible upstream tasks, not the ones it had
        on its prior run.   If the UUID has changed relative to lineage, then
        we need to re-run.
        
        In general, the only reason we should re-run an upstream is b/c of a code change.  And that change
        did not change the tasks parameters.  So it looks the same, but it is actually different.  OR someone 
        re-runs a sql query and the table has changed and the output changes those the parameters are the same. 
        Sometimes folks remove an output to force a single stage to re-run, just for that reason. 
        
        But if an output exists and we want to ignore code version and ignore data changes then
        while we do this, we should re-use our bundle independent of whether an upstream needs to re-run 
        or whether one of our inputs is out of date. 
        
        So one option is to ignore upstreams that need to be re-run.  Re-use blindly.  Like Luigi.  
        
        Another option is that anytime we don't have an input bundle, we attempt to read it not just
        locally, but remotely as well.   
        
        """
        pce = pfs.get_path_cache(task)

        LUIGI_RERUN = False

        if LUIGI_RERUN:
            # Ignore whether upstreams had to be re-run b/c they didn't have bundles.
            # Ignore whether this has to be re-run because existing inputs are newer
            continue

        if pce is None:
            # this can happen with bundles created by other pipelines.
            # still surface the warning, but no longer raise an exception
            _logger.info(
                "Resolve bundles: input bundle {} has no path cache entry.  Likely produced by another pipeline".format(
                    task.task_id))
        else:
            if pce.rerun:
                if verbose: print("Resolve_bundle: an upstream task is in the pce and is being re-run, so we need to reun. getting new output bundle.\n")
                pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
                return regen_bundle

            local_bundle = pfs.get_hframe_by_proc(task.task_id, data_context=data_context)
            assert(local_bundle is not None)

            """ Now we need to check if we should re-run this task because an upstream input exists and has been updated        
            Go through each of the inputs used for this current task.  
            POLICY
            1.) if the date is more recent, it is "new" data.
            2.) if it is older, we should require force (but currently do not and re-run).
            XXX TODO: Add date to the depends_on pb data structure to enforce 2 XXX
            """
            for tup in lng.pb.depends_on:
                if tup.hframe_name == local_bundle.pb.processing_name and tup.hframe_uuid != local_bundle.pb.uuid:
                    if verbose: print("Resolve_bundle: prior input bundle {} {} has new uuid {}\n".format(
                        task.task_id,
                        tup.hframe_uuid,
                        local_bundle.pb.uuid))
                    pfs.new_output_hframe(pipe, is_left_edge_task, data_context=data_context)
                    return regen_bundle

    # 5.) Woot!  Reuse the found bundle.
    if verbose: print("resolve_bundle: reusing bundle\n")
    pfs.reuse_hframe(pipe, bndl, is_left_edge_task, data_context=data_context)
    return use_bundle
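
The resolution logic above is tied to disdat internals (pfs, hframes, lineage protobufs). As a purely illustrative sketch, the same decision order can be expressed over plain dictionaries; every field name here is a hypothetical stand-in, not the disdat API.

def can_reuse_bundle(force, bundle, current_code_version, current_human_name, current_input_uuids):
    """Mirror the decision order of resolve_bundle() using plain dicts.

    bundle is None or a dict like:
        {'lineage': {'code_version': str, 'depends_on': {upstream_name: uuid}},
         'human_name': str}
    current_input_uuids maps upstream names to their current UUIDs.
    Returns True to reuse the cached bundle, False to regenerate it.
    """
    if force:                                                   # 0.) --force or mark_force
        return False
    if bundle is None:                                          # 1.) no prior bundle for this processing id
        return False
    lineage = bundle.get('lineage')
    if lineage is None:                                         # 2.) no lineage record
        return False
    if lineage['code_version'] != current_code_version:        # 3.) the code has changed
        return False
    if bundle['human_name'] != current_human_name:             # 3.5) the output human name has changed
        return False
    for name, prior_uuid in lineage['depends_on'].items():     # 4.) an upstream input has a new UUID
        if current_input_uuids.get(name, prior_uuid) != prior_uuid:
            return False
    return True                                                 # 5.) reuse the found bundle


print(can_reuse_bundle(force=False, bundle=None, current_code_version='abc123',
                       current_human_name='my.bundle', current_input_uuids={}))  # False: nothing cached yet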
Example #4
def _run_local(cli, pipeline_setup_file, arglist, backend):
    """
    Run container locally or run sagemaker container locally
    Args:
        cli (bool): Whether we were called from the CLI or API
        pipeline_setup_file (str): The FQ path to the setup.py used to dockerize the pipeline.
        arglist:
        backend:

    Returns:
        output (str): Returns None if there is a failure

    """

    on_macos = False
    if platform == "darwin":
        on_macos = True

    client = docker.from_env()

    environment = {}
    if 'AWS_PROFILE' in os.environ:
        environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']

    environment[common.LOCAL_EXECUTION] = 'True'

    # Todo: Local runs do not yet set resource limits, but when they do, we'll have to set this
    #environment['DISDAT_CPU_COUNT'] = vcpus

    volumes = {}
    aws_config_dir = os.getenv('AWS_CONFIG_DIR',
                               os.path.join(os.environ['HOME'], '.aws'))
    if aws_config_dir is not None and os.path.exists(aws_config_dir):
        volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}

    local_disdat_meta_dir = DisdatConfig.instance().get_meta_dir()
    volumes[local_disdat_meta_dir] = {'bind': '/root/.disdat', 'mode': 'rw'}

    try:
        if backend == Backend.LocalSageMaker:
            pipeline_image_name = common.make_sagemaker_project_image_name(
                pipeline_setup_file)
            tempdir = tempfile.mkdtemp()
            with open(os.path.join(tempdir, 'hyperparameters.json'),
                      'w') as of:
                json.dump(_sagemaker_hyperparameters_from_arglist(arglist), of)
                args = ['train']  # rewrite to just 'train'
                # On macOS, tempdir returns /var/..., but it is actually /private/var/...
                # Add /private, since that dir is shared with Docker (and /var is not).
                if on_macos:
                    localdir = os.path.join('/private', tempdir[1:])
                else:
                    localdir = tempdir
                volumes[localdir] = {
                    'bind': '/opt/ml/input/config/',
                    'mode': 'rw'
                }
                _logger.info("VOLUMES: {}".format(volumes))
        else:
            # Add the actual command to the arglist (for non-sagemaker runs)
            arglist = [ENTRYPOINT_BIN] + arglist
            pipeline_image_name = common.make_project_image_name(
                pipeline_setup_file)

        _logger.debug('Running image {} with arguments {}'.format(
            pipeline_image_name, arglist))

        stdout = client.containers.run(pipeline_image_name,
                                       arglist,
                                       detach=False,
                                       environment=environment,
                                       init=True,
                                       stderr=True,
                                       volumes=volumes)
        stdout = six.ensure_str(stdout)
        if cli: print(stdout)
        return stdout
    except docker.errors.ContainerError as ce:
        _logger.error(
            "Internal error running image {}".format(pipeline_image_name))
        _logger.error("Error: {}".format(six.ensure_str(ce.stderr)))
        return six.ensure_str(ce)
    except docker.errors.ImageNotFound:
        _logger.error(
            "Unable to find the docker image {}".format(pipeline_image_name))
        return None
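
For reference, here is a stripped-down sketch of the non-SageMaker branch above, driving the docker SDK directly. The image name and command are hypothetical, and the docker package plus a running Docker daemon are assumed.

import os

import docker

client = docker.from_env()

# Mount the host's AWS config read-write at /root/.aws, as the function above does.
volumes = {
    os.path.join(os.environ['HOME'], '.aws'): {'bind': '/root/.aws', 'mode': 'rw'},
}
environment = {'AWS_PROFILE': os.environ.get('AWS_PROFILE', 'default')}

# Hypothetical image name and command.
stdout = client.containers.run('my-pipeline-image:latest',
                               ['echo', 'hello from the container'],
                               detach=False,
                               environment=environment,
                               init=True,
                               stderr=True,
                               volumes=volumes)
print(stdout.decode('utf-8'))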
Example #5
def _run_aws_sagemaker(arglist, fq_repository_name, job_name):
    """
    Runs a training job on AWS SageMaker.  This uses the default machine type
    in the disdat.cfg file.

    Args:
        arglist:
        fq_repository_name (str): fully qualified repository name
        job_name:  instance job name

    Returns:
        TrainingJobArn (str)
    """

    disdat_config = DisdatConfig.instance()

    job_name = job_name.replace(
        '_',
        '-')  # b/c SageMaker complains it must be ^[a-zA-Z0-9](-*[a-zA-Z0-9])*

    hyperparameter_dict = _sagemaker_hyperparameters_from_arglist(arglist)

    algorithm_specification = {
        'TrainingImage': fq_repository_name,
        'TrainingInputMode': 'File'
    }

    role_arn = disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_role_arn')

    input_channel_config = [
        {
            'ChannelName': 'disdat_sagemaker_input_blackhole',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_input_uri'),
                    'S3DataDistributionType': 'FullyReplicated'
                }
            },
            'ContentType': 'application/javascript',
            'CompressionType': 'None',  # | 'Gzip',
            'RecordWrapperType': 'None'  # | 'RecordIO'
        },
    ]

    output_data_config = {
        'S3OutputPath': os.path.join(
            disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_output_uri'), job_name)
    }

    resource_config = {
        'InstanceType': disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_instance_type'),
        'InstanceCount': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_instance_count')),
        'VolumeSizeInGB': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_volume_sizeGB'))
        # 'VolumeKmsKeyId': 'string'
    }

    vpc_config = None  # {'SecurityGroupIds': [], 'Subnets': []}

    stopping_condition = {
        'MaxRuntimeInSeconds': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_max_runtime_sec'))
    }

    tags = [
        {'Key': 'user', 'Value': 'disdat'},
        {'Key': 'job', 'Value': job_name},
    ]

    if False:
        print("Disdat SageMaker configs")
        print("job name: {}".format(job_name))
        print("hparams: {}".format(hyperparameter_dict))
        print("algorithm: {}".format(algorithm_specification))
        print("Role ARN: {}".format(role_arn))
        print("Input data conf: {}".format(input_channel_config))
        print("Output data conf: {}".format(output_data_config))
        print("Resource conf: {}".format(resource_config))
        print("VPC conf: {}".format(vpc_config))
        print("Stopping condition seconds: {}".format(stopping_condition))
        print("Tags: {}".format(tags))

    client = b3.client('sagemaker', region_name=aws.profile_get_region())

    response = client.create_training_job(
        TrainingJobName=job_name,
        HyperParameters=hyperparameter_dict,
        AlgorithmSpecification=algorithm_specification,
        RoleArn=role_arn,
        InputDataConfig=input_channel_config,
        OutputDataConfig=output_data_config,
        ResourceConfig=resource_config,
        StoppingCondition=stopping_condition,
        Tags=tags)

    _logger.info(
        "Disdat SageMaker create_training_job response {}".format(response))
    return response['TrainingJobArn']
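
The function above returns the TrainingJobArn right after submission; a caller typically polls until the job finishes. A minimal polling sketch, assuming boto3 with valid AWS credentials and a hypothetical job name:

import time

import boto3

sm_client = boto3.client('sagemaker')
job_name = 'disdat-hypothetical-job'  # the (sanitized) name passed to create_training_job

while True:
    desc = sm_client.describe_training_job(TrainingJobName=job_name)
    status = desc['TrainingJobStatus']  # InProgress | Completed | Failed | Stopping | Stopped
    print('Training job {} is {}'.format(job_name, status))
    if status in ('Completed', 'Failed', 'Stopped'):
        break
    time.sleep(30)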
Example #6
def _run_aws_batch(arglist, fq_repository_name, job_name, pipeline_image_name,
                   aws_session_token_duration, vcpus, memory, no_submit,
                   job_role_arn):
    """
    Run job on AWS Batch.   Sends to queue configured in disdat.cfg.
    This assumes that you have already created a cluster that will run the jobs
    that have been assigned to that queue.

    Args:
        arglist:
        fq_repository_name (str): The fully qualified docker repository name
        job_name:
        pipeline_image_name:
        aws_session_token_duration:
        vcpus:
        memory:
        no_submit (bool): default False
        job_role_arn (str): Can be None

    Returns:

    """
    def check_role_arn(job_dict, jra):
        """ Check to see if the job desc dictionary contains the same job_role_arn (jra)
        """

        if jra is None:
            if 'jobRoleArn' not in job_dict['containerProperties']:
                return True
        else:
            if 'jobRoleArn' in job_dict['containerProperties']:
                if job_dict['containerProperties']['jobRoleArn'] == jra:
                    return True
        return False

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. The command to execute inside the Docker image; the command
    #    args are more-or-less the same as the ones used to execute
    #    locally using 'dsdt run'

    # Create a Job Definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition

    job_definition_name = aws.batch_get_job_definition_name(
        pipeline_image_name)

    if disdat_config.parser.has_option(_MODULE_NAME,
                                       'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(
            _MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?
    job_definition_obj = aws.batch_get_latest_job_definition(
        job_definition_name)

    if (job_definition_obj is not None
            and job_definition_obj['containerProperties']['image']
            == fq_repository_name
            and job_definition_obj['containerProperties']['vcpus'] == vcpus
            and job_definition_obj['containerProperties']['memory'] == memory
            and check_role_arn(job_definition_obj, job_role_arn)):

        job_definition_fqn = aws.batch_extract_job_definition_fqn(
            job_definition_obj)

        _logger.info("Re-using prior AWS Batch run job definition : {}".format(
            job_definition_obj))

    else:
        """ Whether None or doesn't match, make a new one """

        job_definition_obj = aws.batch_register_job_definition(
            job_definition_name,
            fq_repository_name,
            vcpus=vcpus,
            memory=memory,
            job_role_arn=job_role_arn)

        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)

        _logger.info(
            "New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job description object
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    container_overrides = {'command': arglist}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shells; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())
    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote. To
    # get around this, allow the user to create some temporary AWS
    # credentials.

    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        try:
            token = sts_client.get_session_token(
                DurationSeconds=aws_session_token_duration)
            credentials = token['Credentials']
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials['AccessKeyId']},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials['SecretAccessKey']},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials['SessionToken']},
            ]
        except Exception as e:
            _logger.debug(
                "Unable to generate an STS token, instead trying users default credentials..."
            )
            credentials = b3.session.Session().get_credentials()
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials.access_key},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials.secret_key},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials.token},
            ]

    # 'environment' may not have been set above (e.g., when a job_role_arn is supplied),
    # so create the list if necessary before appending.
    container_overrides.setdefault('environment', []).append({
        'name': 'DISDAT_CPU_COUNT',
        'value': str(vcpus)
    })

    job = client.submit_job(jobName=job_name,
                            jobDefinition=job_definition_fqn,
                            jobQueue=job_queue,
                            containerOverrides=container_overrides)

    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info(
            'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'
            .format(job['jobName'], job['jobId'], job_definition_fqn,
                    job_queue))
        return job
    else:
        _logger.error('Job submission failed: HTTP Status {}'.format(status))
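
submit_job returns immediately, so a caller usually polls AWS Batch for the job's terminal state. A minimal polling sketch, assuming boto3 with valid AWS credentials and a hypothetical job ID taken from the 'jobId' field returned above:

import time

import boto3

batch_client = boto3.client('batch')
job_id = '00000000-0000-0000-0000-000000000000'  # hypothetical; use job['jobId'] from submit_job

while True:
    job_desc = batch_client.describe_jobs(jobs=[job_id])['jobs'][0]
    status = job_desc['status']  # SUBMITTED | PENDING | RUNNABLE | STARTING | RUNNING | SUCCEEDED | FAILED
    print('Batch job {} is {}'.format(job_id, status))
    if status in ('SUCCEEDED', 'FAILED'):
        break
    time.sleep(30)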
        return None