def run_glue_job_from_local_folder_template(local_base, s3_base_path, name, role, job_args=None, allocated_capacity=None, max_retries=None, max_concurrent_runs=None):
    """
    Take a local folder laid out using our agreed folder spec, upload it to s3,
    create the glue job definition and start a job run.

    job_args is a dictionary of arguments that is passed to the glue job when it is run on aws.
    """

    local_base = _end_with_slash(local_base)
    s3_base_path = _end_with_slash(s3_base_path)

    # Create kwargs for the job definition; these are passed through glue_folder_in_s3_to_job_spec to create_glue_job_definition
    job_def_kwargs = {}
    job_def_kwargs['Name'] = name
    job_def_kwargs['Role'] = role
    if allocated_capacity is not None:
        job_def_kwargs['AllocatedCapacity'] = allocated_capacity
    if max_retries is not None:
        job_def_kwargs['MaxRetries'] = max_retries
    if max_concurrent_runs is not None:
        job_def_kwargs['MaxConcurrentRuns'] = max_concurrent_runs

    bucket, bucket_folder = s3_path_to_bucket_key(s3_base_path)
    bucket_folder = bucket_folder + '/'
    delete_folder_from_bucket(bucket, bucket_folder)

    glue_job_folder_to_s3(local_base, s3_base_path)

    job_spec = glue_folder_in_s3_to_job_spec(s3_base_path, **job_def_kwargs)

    response = glue_client.create_job(**job_spec)

    if job_args:
        response = glue_client.start_job_run(JobName=name, Arguments=job_args)
    else:
        response = glue_client.start_job_run(JobName=name)

    return response, job_spec
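

# A usage sketch of the local-folder template (not part of the original module).
# The paths, role and argument values below are hypothetical placeholders; it is
# wrapped in a function so importing this module has no side effects.
def _example_run_glue_job_from_local_folder():
    response, job_spec = run_glue_job_from_local_folder_template(
        local_base="glue_jobs/my_job/",                        # local folder laid out per the spec
        s3_base_path="s3://my-glue-bucket/glue_jobs/my_job/",  # hypothetical bucket/prefix
        name="my_job",
        role="my_glue_iam_role",                               # hypothetical IAM role
        job_args={"--snapshot_date": "2018-01-01"},            # forwarded to start_job_run as Arguments
        allocated_capacity=3,
        max_retries=0,
    )
    return response, job_spec
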
def run_glue_job_from_s3_folder_template(s3_glue_job_folder, name, role, job_args=None, allocated_capacity=None, max_retries=None, max_concurrent_runs=None):
    """
    Take a glue job folder that has already been uploaded to s3 using our agreed folder spec,
    recreate the job definition and start a job run.

    job_args is a dictionary of arguments that is passed to the glue job when it is run on aws.
    """

    s3_glue_job_folder = _end_with_slash(s3_glue_job_folder)
    
    job_def_kwargs = {}
    job_def_kwargs['Name'] = name
    job_def_kwargs['Role'] = role
    if allocated_capacity is not None:
        job_def_kwargs['AllocatedCapacity'] = allocated_capacity
    if max_retries is not None:
        job_def_kwargs['MaxRetries'] = max_retries
    if max_concurrent_runs is not None:
        job_def_kwargs['MaxConcurrentRuns'] = max_concurrent_runs

    job_spec = glue_folder_in_s3_to_job_spec(s3_glue_job_folder, **job_def_kwargs)

    # Delete any pre-existing job definition with this name before recreating it
    delete_job(name)
    response = glue_client.create_job(**job_spec)

    if job_args:
        response = glue_client.start_job_run(JobName=name, Arguments=job_args)
    else:
        response = glue_client.start_job_run(JobName=name)

    return response, job_spec
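

# Usage sketch for re-running a job whose folder is already in s3 (hypothetical
# s3 path and names; wrapped in a function so importing has no side effects).
def _example_run_glue_job_from_s3_folder():
    response, job_spec = run_glue_job_from_s3_folder_template(
        s3_glue_job_folder="s3://my-glue-bucket/glue_jobs/my_job/",
        name="my_job",
        role="my_glue_iam_role",
        job_args={"--snapshot_date": "2018-01-01"},
    )
    return response, job_spec
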
def delete_all_target_data_from_database(database_metadata_path):
    """
    For each table defined in the given database metadata folder, delete the data at its s3 location.
    """
    files = os.listdir(database_metadata_path)
    files = {f for f in files if re.match(r".+\.json$", f)}

    if "database.json" in files:
        db_metadata = read_json(os.path.join(database_metadata_path, "database.json"))
        database_name = db_metadata["name"]
    else:
        raise ValueError("database.json not found in metadata folder")

    table_paths = files.difference({"database.json"})
    for table_path in table_paths:
        table_path = os.path.join(database_metadata_path, table_path)
        table_metadata = read_json(table_path)
        location = table_metadata["location"]
        bucket, bucket_folder = s3_path_to_bucket_key(location)
        delete_folder_from_bucket(bucket, bucket_folder)
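

# Usage sketch: the metadata folder is expected to contain database.json plus one
# json file per table, each with a "location" key holding the table's s3 path,
# e.g. {"name": "orders", "location": "s3://my-bucket/my_database/orders/"}.
# The folder path below is a hypothetical placeholder.
def _example_delete_all_target_data():
    delete_all_target_data_from_database("metadata/my_database/")
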
def get_glue_job_and_resources_from_s3(s3_base_path):
    """
    Given a glue job folder in s3, return the s3 path to job.py plus comma-separated lists of the
    files in glue_resources and glue_py_resources (including those in any shared_job_resources folder).
    """

    s3_base_path = _end_with_slash(s3_base_path)

    bucket, bucket_folder = s3_path_to_bucket_key(s3_base_path)
    bucket_folder = bucket_folder[:-1]

    shared_bucket_folder = '/'.join(bucket_folder.split('/')[:-1]) + '/shared_job_resources'
    
    files_list = get_file_list_from_bucket(bucket, bucket_folder)
    
    if "{}/job.py".format(bucket_folder) not in files_list:
        raise ValueError("Cannot find job.py in the folder specified ({}), stopping".format(bucket_folder))
    else:
        job_path = "s3://{}/{}/job.py".format(bucket, bucket_folder)

    # The shared_job_resources folder may not exist, in which case there are no shared files
    try:
        shared_files_list = get_file_list_from_bucket(bucket, shared_bucket_folder)
    except Exception:
        shared_files_list = []
    
    # Collect python resources (job-specific plus shared) as a comma-separated string
    py_resources = [f for f in files_list if "/glue_py_resources/" in f]
    py_shared_resources = [f for f in shared_files_list if "/glue_py_resources/" in f]

    py_resources = py_resources + py_shared_resources
    py_resources = ["s3://{}/{}".format(bucket, f) for f in py_resources]
    py_resources = ",".join(py_resources)
    
    # Collect other resources (job-specific plus shared) as a comma-separated string
    resources = [f for f in files_list if "/glue_resources/" in f]
    shared_resources = [f for f in shared_files_list if "/glue_resources/" in f]

    resources = resources + shared_resources
    resources = ["s3://{}/{}".format(bucket, f) for f in resources]
    resources = ",".join(resources)
    
    if " " in resources or " " in py_resources :
        raise ValueError("The files in glue_resources and glue_py_resources must not have spaces in their filenames")
    
    return (job_path, resources, py_resources)
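

# Usage sketch showing the shape of the returned values (hypothetical s3 path).
def _example_get_glue_job_and_resources():
    job_path, resources, py_resources = get_glue_job_and_resources_from_s3("s3://my-glue-bucket/glue_jobs/my_job/")
    # job_path     -> "s3://my-glue-bucket/glue_jobs/my_job/job.py"
    # resources    -> comma-separated s3 paths of files under glue_resources/ (may be an empty string)
    # py_resources -> comma-separated s3 paths of files under glue_py_resources/ (may be an empty string)
    return job_path, resources, py_resources
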
def glue_folder_in_s3_to_job_spec(s3_base_path, **kwargs):
    """
    Given a glue job folder uploaded to s3 in the agreed format, build the job spec,
    i.e. the kwargs to pass to glue_client.create_job.
    """

    # Base path should be a folder, so ensure it ends in "/".
    # Otherwise listing the bucket could cause problems, e.g. where there are two jobs named job_1 and job_12
    s3_base_path = _end_with_slash(s3_base_path)
    bucket, bucket_folder = s3_path_to_bucket_key(s3_base_path)

    (job_path, resources, py_resources) = get_glue_job_and_resources_from_s3(s3_base_path)

    kwargs["ScriptLocation"] = job_path
    if resources != '':
        kwargs["extra-files"] = resources
    if py_resources != '':
        kwargs["extra-py-files"] = py_resources
    kwargs["TempDir"] = "s3://{}/{}/{}/temp_dir/".format(bucket, bucket_folder, kwargs["Name"])

    job_spec = create_glue_job_definition(**kwargs)

    return job_spec
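

# Usage sketch: Name and Role are required kwargs (Name is also used to build TempDir),
# and the resulting dict is what gets passed to glue_client.create_job. Values are hypothetical.
def _example_build_job_spec():
    job_spec = glue_folder_in_s3_to_job_spec(
        "s3://my-glue-bucket/glue_jobs/my_job/",
        Name="my_job",
        Role="my_glue_iam_role",
        AllocatedCapacity=3,
    )
    return job_spec
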
def glue_job_folder_to_s3(local_base, s3_base_path):
    """
    Take a glue job folder on local disk and upload it to s3.

    The folder must be laid out as follows:
    base dir
      job.py
      glue_py_resources/
        zip and python files
        github_zip_urls.txt <- file containing urls of additional zip files e.g. on github
      glue_resources/
        txt, sql, json, or csv files

    The contents of base dir are uploaded under s3_base_path.
    """
    local_base = _end_with_slash(local_base)
    s3_base_path = _end_with_slash(s3_base_path)

    base_dir_listing = os.listdir(local_base)
    
    # Upload job
    bucket, bucket_folder = s3_path_to_bucket_key(s3_base_path)
    bucket_folder = bucket_folder[:-1]

    # Check that there is at least a job.py in the given folder and then upload job if appropriate
    if 'job.py' not in base_dir_listing:
        if local_base.split('/')[-2] != 'shared_job_resources':
            raise ValueError("Could not find job.py in base directory provided ({}), stopping.\nOnly folder allowed to have no job.py is a folder named shared_job_resources".format(local_base))
    else:
        local_job_path = os.path.join(local_base, "job.py")
        job_path = upload_file_to_s3_from_path(local_job_path, bucket, "{}/job.py".format(bucket_folder))

    # Upload all the .sql, .json, .csv and .txt files in glue_resources
    # Check existence of folder, otherwise skip
    resources_path = os.path.join(local_base, "glue_resources")
    if os.path.isdir(resources_path):
        resource_listing = os.listdir(resources_path)
        regex = r".+(\.sql|\.json|\.csv|\.txt)$"
        resource_listing = [f for f in resource_listing if re.match(regex, f)]

        for f in resource_listing:
            resource_local_path = os.path.join(resources_path, f)
            path = upload_file_to_s3_from_path(resource_local_path, bucket, "{}/glue_resources/{}".format(bucket_folder, f))


    # Upload all the .py and .zip files in glue_py_resources
    # Check existence of folder, otherwise skip
    py_resources_path = os.path.join(local_base, "glue_py_resources")
    delete_these_paths = []
    if os.path.isdir(py_resources_path):

        # If a github_zip_urls.txt file is present, download each zip it lists,
        # unnest it and stage it alongside the other python resources
        zip_urls_path = os.path.join(py_resources_path, "github_zip_urls.txt")
        if os.path.exists(zip_urls_path):

            with open(zip_urls_path, "r") as f:
                urls = f.readlines()

            # Skip blank or obviously invalid lines
            urls = [url for url in urls if len(url) > 10]

            for i, url in enumerate(urls):

                this_zip_path = os.path.join(py_resources_path, "{}.zip".format(i))
                urlretrieve(url, this_zip_path)
                new_zip_path = unnest_github_zipfile_and_return_new_zip_path(this_zip_path)
                os.remove(this_zip_path)
                delete_these_paths.append(new_zip_path)

        resource_listing = os.listdir(py_resources_path)
        regex = r".+(\.py|\.zip)$"
        resource_listing = [f for f in resource_listing if re.match(regex, f)]

        for f in resource_listing:
            resource_local_path = os.path.join(py_resources_path, f)
            path = upload_file_to_s3_from_path(resource_local_path, bucket, "{}/glue_py_resources/{}".format(bucket_folder, f))

        # Remember to delete the files we downloaded
        for this_path in delete_these_paths:
            os.remove(this_path)
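

# Usage sketch: upload a local job folder laid out as described in the docstring
# above (hypothetical local and s3 paths).
def _example_upload_job_folder():
    glue_job_folder_to_s3(
        local_base="glue_jobs/my_job/",
        s3_base_path="s3://my-glue-bucket/glue_jobs/my_job/",
    )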