def run_glue_job_from_local_folder_template(local_base, s3_base_path, name, role, job_args = None, allocated_capacity = None, max_retries = None, max_concurrent_runs = None):
    """
    Take a local folder layed out using our agreed folder spec, upload to s3, and run

    job_args is a dictionary that is passed to the glue job when it is run on aws.
    """

    local_base = _end_with_slash(local_base)
    s3_base_path = _end_with_slash(s3_base_path)

    # Create kwargs for the job definition; these are passed through to create_glue_job_definition
    job_def_kwargs = {}
    job_def_kwargs['Name'] = name
    job_def_kwargs['Role'] = role
    if allocated_capacity is not None :
        job_def_kwargs['AllocatedCapacity'] = allocated_capacity
    if max_retries is not None :
        job_def_kwargs['MaxRetries'] = max_retries
    if max_concurrent_runs is not None :
        job_def_kwargs['MaxConcurrentRuns'] = max_concurrent_runs

    bucket, bucket_folder = s3_path_to_bucket_key(s3_base_path)
    bucket_folder = bucket_folder + '/'
    delete_folder_from_bucket(bucket, bucket_folder)

    glue_job_folder_to_s3(local_base, s3_base_path)

    job_spec = glue_folder_in_s3_to_job_spec(s3_base_path, **job_def_kwargs)

    response = glue_client.create_job(**job_spec)
    if job_args:
        response = glue_client.start_job_run(JobName=name, Arguments = job_args)
    else:
        response = glue_client.start_job_run(JobName=name)
    return response, job_spec
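# A minimal usage sketch for run_glue_job_from_local_folder_template. The bucket, role
# and job names below are hypothetical (not taken from the original source); the local
# folder is assumed to follow the job.py / glue_resources / glue_py_resources layout
# described in glue_job_folder_to_s3.
def _example_run_glue_job_from_local_folder():
    response, job_spec = run_glue_job_from_local_folder_template(
        local_base="glue_jobs/my_job/",
        s3_base_path="s3://my-bucket/glue_jobs/my_job/",
        name="my_job",
        role="my_glue_role",
        job_args={"--snapshot_date": "2018-01-01"},
        allocated_capacity=5,
    )
    return response, job_spec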
def all_glue_job_folders_to_s3(local_glue_jobs_dir, s3_glue_jobs_dir, include_folders = None, exclude_folders = None) :
    """
    Iterate though all folders in the glue_job dir and upload them to a corresponsing 
    glue_job dir in s3. Each folder in local_glue_jobs_dir is uploaded using glue_job_folder_to_s3.
    Provide list of folder glue_job folder names in include_folders and exclude_folders to include and exclude them from the upload. 
    """
    local_glue_jobs_dir = _end_with_slash(local_glue_jobs_dir)

    # Do checks
    if not (include_folders is None or isinstance(include_folders, list)) :
        raise ValueError('include_folders must be a list or None')
    if not (exclude_folders is None or isinstance(exclude_folders, list)) : 
        raise ValueError('exclude_folders must be a list or None')

    if include_folders is not None and exclude_folders is not None :
        if len(set(include_folders).intersection(set(exclude_folders))) != 0 :
            raise ValueError('Some folders are listed in both include_folders and exclude_folders')

    # Create list of folders
    glue_job_folders = [d for d in os.listdir(local_glue_jobs_dir) if os.path.isdir(os.path.join(local_glue_jobs_dir, d)) and d[0] != '.']
    if include_folders is not None :
        test_include = [i in glue_job_folders for i in include_folders]
        if not all(test_include) :
            raise ValueError('One of the folders listed in include_folders does not exist in {}'.format(local_glue_jobs_dir))
        else :
            glue_job_folders = include_folders
    if exclude_folders is not None :
        glue_job_folders = [g for g in glue_job_folders if g not in exclude_folders]

    s3_glue_jobs_dir = _end_with_slash(s3_glue_jobs_dir)

    for glue_job in glue_job_folders :
        glue_job_folder_to_s3(local_glue_jobs_dir + glue_job + '/', s3_glue_jobs_dir + glue_job + '/')
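# A short usage sketch for all_glue_job_folders_to_s3 (hypothetical paths and folder
# names, not from the original source): upload every glue job folder except one that is
# still work in progress.
def _example_upload_all_glue_job_folders():
    all_glue_job_folders_to_s3(
        local_glue_jobs_dir="glue_jobs/",
        s3_glue_jobs_dir="s3://my-bucket/glue_jobs/",
        exclude_folders=["work_in_progress_job"],
    )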
def upload_directory_to_s3(dir_path, s3_dir_parent_path, regex = r".+(\.sql|\.json|\.csv|\.txt|\.py|\.sh)$") :
    
    # Make sure folder paths are correct
    dir_path = _end_with_slash(dir_path)
    s3_dir_parent_path = _end_with_slash(s3_dir_parent_path)
    
    dir_path_prefix = '/'.join(dir_path.split('/')[:-2])
    if dir_path_prefix != '' :
        dir_path_prefix = dir_path_prefix + '/'

    bucket, key = s3_path_to_bucket_key(s3_dir_parent_path)
    for root, directories, filenames in os.walk(dir_path):
        for filename in filenames: 
            f = os.path.join(root,filename)
            if re.match(regex, f) : 
                path_out = upload_file_to_s3_from_path(f, bucket, key + f.replace(dir_path_prefix, ''))
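# A usage sketch for upload_directory_to_s3 (hypothetical paths, not from the original
# source): every file under meta_data/ that matches the default file-extension regex is
# uploaded beneath s3://my-bucket/config/meta_data/.
def _example_upload_directory():
    upload_directory_to_s3(
        dir_path="meta_data/",
        s3_dir_parent_path="s3://my-bucket/config/",
    )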
def run_glue_job_from_s3_folder_template(s3_glue_job_folder, name, role, job_args = None, allocated_capacity = None, max_retries = None, max_concurrent_runs = None) :
    
    s3_glue_job_folder = _end_with_slash(s3_glue_job_folder)
    
    job_def_kwargs = {}
    job_def_kwargs['Name'] = name
    job_def_kwargs['Role'] = role
    if allocated_capacity is not None :
        job_def_kwargs['AllocatedCapacity'] = allocated_capacity
    if max_retries is not None :
        job_def_kwargs['MaxRetries'] = max_retries
    if max_concurrent_runs is not None :
        job_def_kwargs['MaxConcurrentRuns'] = max_concurrent_runs

    bucket, bucket_folder = s3_path_to_bucket_key(s3_glue_job_folder)
    
    job_spec = glue_folder_in_s3_to_job_spec(s3_glue_job_folder, **job_def_kwargs)

    del_response = delete_job(name)
    response = glue_client.create_job(**job_spec)

    if job_args:
        response = glue_client.start_job_run(JobName=name, Arguments = job_args)
    else:
        response = glue_client.start_job_run(JobName=name)
    return response, job_spec
def get_glue_job_and_resources_from_s3(s3_base_path) :
    
    s3_base_path = _end_with_slash(s3_base_path)
    
    bucket, bucket_folder = s3_path_to_bucket_key(s3_base_path)
    bucket_folder = bucket_folder[:-1]
    
    shared_bucket_folder = '/'.join(bucket_folder.split('/')[:-1]) + '/shared_job_resources'
    
    files_list = get_file_list_from_bucket(bucket, bucket_folder)
    
    if "{}/job.py".format(bucket_folder) not in files_list:
        raise ValueError("Cannot find job.py in the folder specified ({}), stopping".format(bucket_folder))
    else:
        job_path = "s3://{}/{}/job.py".format(bucket, bucket_folder)

    try : 
        shared_files_list = get_file_list_from_bucket(bucket, shared_bucket_folder)
    except Exception :
        shared_files_list = []
    
    # Do py_resources
    py_resources = [f for f in files_list if "/glue_py_resources/" in f]
    py_shared_resources = [f for f in shared_files_list if "/glue_py_resources/" in f]
    
    py_resources = py_resources + py_shared_resources
    py_resources = ["s3://{}/{}".format(bucket, f) for f in py_resources]
    py_resources = ",".join(py_resources)
    
    # Do resources
    resources = [f for f in files_list if "/glue_resources/" in f]
    shared_resources = [f for f in shared_files_list if "/glue_resources/" in f]
    
    resources = resources + shared_resources
    resources = ["s3://{}/{}".format(bucket, f) for f in resources]
    resources = ",".join(resources)
    
    if " " in resources or " " in py_resources :
        raise ValueError("The files in glue_resources and glue_py_resources must not have spaces in their filenames")
    
    return (job_path, resources, py_resources)
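# For illustration only (hypothetical bucket and job name): given
# s3://my-bucket/glue_jobs/my_job/, the function above would return something like
#   job_path     = "s3://my-bucket/glue_jobs/my_job/job.py"
#   resources    = "s3://my-bucket/glue_jobs/my_job/glue_resources/lookup.csv"
#   py_resources = "s3://my-bucket/glue_jobs/my_job/glue_py_resources/helpers.zip"
# with multiple files (including any shared_job_resources files) joined into a single
# comma-separated string.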
def glue_folder_in_s3_to_job_spec(s3_base_path, **kwargs) :
    """
    Given a set of files uploaded to s3 in a specific format, use them to create a glue job
    """

    # Base path should be a folder; ensure it ends in "/".
    # Otherwise listing the bucket could cause problems, e.g. when there are two jobs named job_1 and job_12.
    s3_base_path = _end_with_slash(s3_base_path)
    bucket, bucket_folder = s3_path_to_bucket_key(s3_base_path)

    (job_path, resources, py_resources) = get_glue_job_and_resources_from_s3(s3_base_path)

    kwargs["ScriptLocation"] = job_path
    if resources != '':
        kwargs["extra-files"] = resources
    if py_resources != '':
        kwargs["extra-py-files"] = py_resources
    kwargs["TempDir"] = "s3://{}/{}/{}/temp_dir/".format(bucket, bucket_folder, kwargs["Name"])

    job_spec = create_glue_job_definition(**kwargs)

    return job_spec
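# A sketch of how glue_folder_in_s3_to_job_spec is typically called (hypothetical
# values, not from the original source). The keyword arguments are passed straight
# through to create_glue_job_definition after ScriptLocation, extra-files,
# extra-py-files and TempDir have been filled in from the s3 folder contents.
def _example_build_job_spec():
    job_spec = glue_folder_in_s3_to_job_spec(
        "s3://my-bucket/glue_jobs/my_job/",
        Name="my_job",
        Role="my_glue_role",
        AllocatedCapacity=5,
        MaxRetries=0,
    )
    return job_spec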
def get_file_list_from_bucket(bucket, bucket_folder) :
    bucket_folder = _end_with_slash(bucket_folder)
    contents = s3_client.list_objects(Bucket=bucket, Prefix=bucket_folder)
    files_list = [c["Key"] for c in contents["Contents"]]
    return files_list
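# get_file_list_from_bucket relies on list_objects, which returns at most 1000 keys per
# call. A hypothetical paginated variant (not in the original source), using boto3's
# list_objects_v2 paginator, would look like this:
def get_file_list_from_bucket_paginated(bucket, bucket_folder):
    bucket_folder = _end_with_slash(bucket_folder)
    files_list = []
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=bucket_folder):
        files_list.extend(obj["Key"] for obj in page.get("Contents", []))
    return files_list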
def glue_job_folder_to_s3(local_base, s3_base_path):
    """
    Take a folder structure on local disk and transfer to s3.

    Folder must be formatted as follows:
    base dir
      job.py
      glue_py_resources/
        zip and python files
        zip_urls <- file containing urls of additional zip files e.g. on github
      glue_resources/
        txt, sql, json, or csv files

    The folder name base dir will be in the folder s3_path_to_glue_jobs_folder
    """
    local_base = _end_with_slash(local_base)
    s3_base_path = _end_with_slash(s3_base_path)

    base_dir_listing = os.listdir(local_base)
    
    # Upload job
    bucket, bucket_folder = s3_path_to_bucket_key(s3_base_path)
    bucket_folder = bucket_folder[:-1]

    # Check that there is at least a job.py in the given folder and then upload job if appropriate
    if 'job.py' not in base_dir_listing :
        if local_base.split('/')[-2] != 'shared_job_resources' :
            raise ValueError("Could not find job.py in base directory provided ({}), stopping.\nOnly folder allowed to have no job.py is a folder named shared_job_resources".format(local_base))
    else :
        local_job_path = os.path.join(local_base, "job.py")
        job_path = upload_file_to_s3_from_path(local_job_path, bucket, "{}/job.py".format(bucket_folder))

    # Upload all the .sql, .json, .csv or .txt files in glue_resources
    # Check existence of folder, otherwise skip
    resources_path = os.path.join(local_base, "glue_resources")
    if os.path.isdir(resources_path):
        resource_listing = os.listdir(os.path.join(local_base, 'glue_resources'))
        regex = ".+(\.sql|\.json|\.csv|\.txt)$"
        resource_listing = [f for f in resource_listing if re.match(regex, f)]

        for f in resource_listing:
            resource_local_path = os.path.join(local_base, "glue_resources", f)
            path = upload_file_to_s3_from_path(resource_local_path, bucket, "{}/glue_resources/{}".format(bucket_folder,f))


    # Upload all the .py or .zip files in glue_py_resources
    # Check existence of folder, otherwise skip
    py_resources_path = os.path.join(local_base, "glue_py_resources")
    delete_these_paths = []
    if os.path.isdir(py_resources_path):

        zip_urls_path = os.path.join(py_resources_path, "github_zip_urls.txt")
        if os.path.exists(zip_urls_path):

            with open(zip_urls_path, "r") as f:
                urls = f.readlines()

            # Strip trailing newlines from readlines() and drop blank or obviously invalid lines
            urls = [url.strip() for url in urls if len(url.strip()) > 10]

            for i, url in enumerate(urls):

                this_zip_path = os.path.join(py_resources_path,"{}.zip".format(i))
                urlretrieve(url,this_zip_path)
                new_zip_path = unnest_github_zipfile_and_return_new_zip_path(this_zip_path)
                os.remove(this_zip_path)
                delete_these_paths.append(new_zip_path)


        resource_listing = os.listdir(os.path.join(local_base, 'glue_py_resources'))
        regex = ".+(\.py|\.zip)$"
        resource_listing = [f for f in resource_listing if re.match(regex, f)]

        for f in resource_listing:
            resource_local_path = os.path.join(local_base, "glue_py_resources", f)
            path = upload_file_to_s3_from_path(resource_local_path, bucket, "{}/glue_py_resources/{}".format(bucket_folder,f))

        # Remember to delete the files we downloaded
        for this_path in delete_these_paths:
            os.remove(this_path)
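# A usage sketch for glue_job_folder_to_s3 on its own (hypothetical paths, not from the
# original source), assuming a local layout such as:
#   glue_jobs/my_job/job.py
#   glue_jobs/my_job/glue_resources/lookup.csv
#   glue_jobs/my_job/glue_py_resources/helpers.zip
def _example_upload_single_glue_job_folder():
    glue_job_folder_to_s3(
        local_base="glue_jobs/my_job/",
        s3_base_path="s3://my-bucket/glue_jobs/my_job/",
    )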