Example #1
def upload_bootstrap(package_path, s3_client, launch_prefix):
    """
    Uploads the boostrap.sh script to be used by the emr job during the bootstrap step
    :param package_path:
    :param s3_client:
    :param launch_prefix:
    :return:
    """
    bucket, prefix = get_bucket_key(package_path)
    with open('scripts/bootstrap_template.sh', 'r') as bootstrap_file:
        text = bootstrap_file.read().format(bucket=bucket,
                                            prefix=prefix,
                                            package=prefix.split('/')[-1])
    with open('bootstrap.sh', 'w') as bootstrap:
        bootstrap.write(text)

    bootstrap_bucket, bootstrap_prefix = get_bucket_key(launch_prefix)
    s3_client.upload_file('bootstrap.sh', bootstrap_bucket,
                          f'{bootstrap_prefix}bootstrap.sh')
    os.remove('bootstrap.sh')

    return f's3://{bootstrap_bucket}/{bootstrap_prefix}bootstrap.sh'
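
Every example in this listing calls a get_bucket_key helper that is not shown. A minimal sketch of what it presumably does, assuming it simply splits an s3:// URI into its bucket and key (the implementation below is inferred from how the examples use it, not taken from the original source):

def get_bucket_key(s3_path: str) -> tuple:
    # Assumed helper: 's3://bucket/prefix/file.txt' -> ('bucket', 'prefix/file.txt')
    without_scheme = s3_path.replace('s3://', '', 1)
    bucket, _, key = without_scheme.partition('/')
    return bucket, key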
Example #2
def s3_copy(items: list, input_prefix: str, output_prefix: str, s3_client,
            retries: int):
    """
    Copies each file in items from input_prefix to output_prefix and returns a list of the
    final locations of the copied files
    :param items: e.g. ['s3://bucket/prefix/file1.txt', 's3://bucket/prefix/file2.txt']
    :param input_prefix: e.g. 's3://bucket/prefix/'
    :param output_prefix: e.g. 's3://bucket2/prefix2/'
    :param s3_client: boto3 s3 client
    :param retries: e.g. 5
    :return: e.g. ['s3://bucket2/prefix2/file1.txt', 's3://bucket2/prefix2/file2.txt']
    """
    in_bucket, in_prefix = get_bucket_key(input_prefix)
    out_bucket, out_prefix = get_bucket_key(output_prefix)
    new_files = []
    for s3_key in items:
        result = individual_copy_file(file=s3_key,
                                      input_prefix=in_prefix,
                                      output_prefix=out_prefix,
                                      input_bucket=in_bucket,
                                      output_bucket=out_bucket,
                                      s3_client=s3_client,
                                      max_retries=retries)
        new_files.append(result)
    return new_files
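
individual_copy_file is referenced here but not included in the listing. A hedged sketch, assuming it re-roots the source key under the output prefix and retries the copy a bounded number of times (the signature matches the call above; everything else is an assumption):

def individual_copy_file(file, input_prefix, output_prefix, input_bucket,
                         output_bucket, s3_client, max_retries):
    # Assumed behaviour: strip the input prefix from the source path, re-root the
    # remainder under the output prefix, and retry copy_object up to max_retries times.
    source_key = file.replace(f's3://{input_bucket}/', '', 1)
    dest_key = f"{output_prefix}{source_key[len(input_prefix):]}"
    for attempt in range(max_retries):
        try:
            s3_client.copy_object(Bucket=output_bucket, Key=dest_key,
                                  CopySource={'Bucket': input_bucket, 'Key': source_key})
            return f's3://{output_bucket}/{dest_key}'
        except Exception:
            if attempt == max_retries - 1:
                raise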
Example #3
def s3_list(s3, path: str, how: ListType = ListType.object_only) -> list:
    """

    :param how: full -> s3://bucket/prefix/file.txt | prefix -> prefix/file.txt | object_only -> file.txt (default)
    :param s3: boto3 s3 client
    :param str path: the s3 path to list e.g. s3://bucket/prefix/subprefix/
    :return list: list of tuples (key, last_modified) e.g. (prefix/file.txt, datetime(2020, 01, 01))
    """
    bucket, prefix = get_bucket_key(path)
    still_more = True
    continuation_token = None
    output = []
    while still_more:
        response = list_more(s3, bucket, prefix, continuation_token)
        if how == ListType.full:
            current_output = [(F"s3://{bucket}/{x['Key']}", x['LastModified'].replace(tzinfo=pytz.UTC))
                              for x in response['Contents'] if not x['Key'].endswith('/')]
        elif how == ListType.prefix:
            current_output = [(x['Key'], x['LastModified'].replace(tzinfo=pytz.UTC))
                              for x in response['Contents'] if not x['Key'].endswith('/')]
        elif how == ListType.object_only:
            current_output = [(x['Key'].split('/')[-1], x['LastModified'].replace(tzinfo=pytz.UTC))
                              for x in response['Contents'] if not x['Key'].endswith('/')]
        else:
            raise ValueError('how must be specified as one of a valid ListType')
        output.extend(current_output)
        if response['IsTruncated']:
            continuation_token = response['NextContinuationToken']
        else:
            still_more = False

    return output
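
list_more is another helper that is not part of the listing. Given how its response is consumed above (Contents, IsTruncated, NextContinuationToken), it is presumably a thin wrapper around list_objects_v2; a minimal sketch under that assumption:

def list_more(s3, bucket, prefix, continuation_token=None):
    # Assumed wrapper around list_objects_v2 that forwards the continuation token
    kwargs = {'Bucket': bucket, 'Prefix': prefix}
    if continuation_token:
        kwargs['ContinuationToken'] = continuation_token
    return s3.list_objects_v2(**kwargs)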
Example #4
def upload_new_run_info(last_run, run_info_prefix, cdc_paths, full_paths,
                        s3_client) -> str:
    """
    Uploads the new run info file to s3 to be used by the spark job downstream and returns the s3 path of the uploaded
    file
    :param last_run: The location of the specific last run info file, from a previous job run, if one exists
    :param run_info_prefix: The prefix where the current run info file will be stored, typically the prefix of
     the last_run location
    :param cdc_paths: A list of cdc prefixes to be included
    :param full_paths: A list of full load prefixes to be included
    :param s3_client: s3 boto3 client
    :return: The s3 path of the uploaded file
    """
    if last_run:
        this_run = get_old_new(s3=s3_client,
                               old_info=last_run,
                               cdc_prefixes=cdc_paths,
                               full_load_prefixes=full_paths)
    else:
        this_run = get_old_new(s3=s3_client,
                               cdc_prefixes=cdc_paths,
                               full_load_prefixes=full_paths)

    job_run_bucket, job_run_prefix = get_bucket_key(run_info_prefix)
    this_run_id = this_run['run_id']
    new_run_name = f'run_{this_run_id}.json'
    with open(new_run_name, 'w') as this_run_file:
        this_run_file.write(json.dumps(this_run))

    this_job_run_key = f"{job_run_prefix}{new_run_name}"
    s3_client.upload_file(new_run_name, job_run_bucket, this_job_run_key)
    this_job_run_full_path = f's3://{job_run_bucket}/{this_job_run_key}'
    os.remove(new_run_name)
    return this_job_run_full_path
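
For reference, the run info file written here carries the structure produced by get_old_new (Example #7). An illustrative, hand-written example of that JSON shape, with made-up bucket and prefix names:

example_run_info = {
    'cdc_files': {
        's3://bucket/cdc/table_a/': {
            'files': ['s3://bucket/cdc/table_a/part-0001.parquet'],
            'max_ts': '2020-01-01 00:00:00.000'
        }
    },
    'full_load_files': {
        's3://bucket/full/table_b/': {
            'files': ['s3://bucket/full/table_b/LOAD00000001.parquet']
        }
    },
    'run_id': 3
}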
Example #5
def upload_start_job(script, spark_args, s3_client, job_run_full, launch_path):
    """
    Uploads the strart_job.sh script which is responsible for starting the pyspark script
    :param package_path:
    :param script:
    :param spark_args:
    :param s3_client:
    :param job_run_full:
    :param launch_path:
    :return:
    """

    with open('scripts/start_job_template.sh', 'r') as start_job_file:
        text = start_job_file.read().format(
            job_script=script, arguments=f"--run_info {job_run_full} {spark_args}")
    with open('start_job.sh', 'w') as start_job:
        start_job.write(text)

    launch_bucket, launch_prefix = get_bucket_key(launch_path)
    s3_client.upload_file('start_job.sh', launch_bucket,
                          f'{launch_prefix}start_job.sh')
    os.remove('start_job.sh')
    return f's3://{launch_bucket}/{launch_prefix}start_job.sh'
Example #6
def run(emr_client, inputs_file, last_run_path, s3_client):
    """
    Reads the job inputs file, uploads the run info and launch scripts, and starts the EMR job
    :param emr_client: boto3 emr client
    :param inputs_file: s3 path of the job inputs JSON file
    :param last_run_path: s3 path of the previous run info file, if one exists
    :param s3_client: boto3 s3 client
    :return: the run_job_flow response returned by start_emr
    """
    inputs_bucket, inputs_prefix = get_bucket_key(inputs_file)
    s3_client.download_file(inputs_bucket, inputs_prefix, 'inputs.json')
    with open('inputs.json', 'r') as downloaded_inputs:
        inputs = json.loads(downloaded_inputs.read())
    os.remove('inputs.json')

    # Bookmarking/file tracking file
    this_job_run_full_path = upload_new_run_info(
        last_run=last_run_path,
        run_info_prefix=inputs['run_info_prefix'],
        cdc_paths=inputs.get('cdc_paths'),
        full_paths=inputs.get('full_paths'),
        s3_client=s3_client)

    # Shell script that runs spark-submit to start the spark job
    start_path = upload_start_job(script=inputs['script'],
                                  spark_args=inputs['spark_args'],
                                  s3_client=s3_client,
                                  job_run_full=this_job_run_full_path,
                                  launch_path=inputs['launch_prefix'])
    # Shell script that installs requirements on all clusters
    bootstrap_path = upload_bootstrap(package_path=inputs['package'],
                                      s3_client=s3_client,
                                      launch_prefix=inputs['launch_prefix'])

    response = start_emr(emr_path=inputs['emr_details'],
                         s3_client=s3_client,
                         emr_client=emr_client,
                         start_path=start_path,
                         bootstrap_path=bootstrap_path)

    return response
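
A hedged sketch of how run might be invoked, assuming standard boto3 clients and illustrative s3 paths (the real entry point and paths are not shown in this listing):

import boto3

emr = boto3.client('emr')
s3 = boto3.client('s3')

# Illustrative paths; the actual inputs file and last-run location are deployment specific
response = run(emr_client=emr,
               inputs_file='s3://my-bucket/launch/inputs.json',
               last_run_path='s3://my-bucket/run_info/run_2.json',
               s3_client=s3)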
Example #7
def get_old_new(s3,
                cdc_prefixes: typing.Optional[list] = None,
                full_load_prefixes: typing.Optional[list] = None,
                old_info: typing.Optional[str] = None):
    """
    Gets the list of objects of old and new and compares them. Returns a dictionary that conforms to the old_info
    format which details the files that are yet to be processed.
    :param s3: boto3 s3 client
    :param old_info: s3 location of the last job run info
    :param cdc_prefixes: list of cdc prefixes
    :param full_load_prefixes: list of full_load prefixes
    :return:
    """
    if not cdc_prefixes and not full_load_prefixes:
        raise ValueError(
            "cdc_prefixes and full_load_prefixes cannot both be null. One must be specified"
        )

    if old_info:
        old_bucket, old_prefix = get_bucket_key(old_info)
        s3.download_file(old_bucket, old_prefix, 'old_info.json')
        with open("old_info.json", "r") as old_file:
            old = json.loads(old_file.read())
        os.remove('old_info.json')
        new_run_id = old['run_id'] + 1
    else:
        # Assumes that there are no previous runs/no previously processed files
        old = {'cdc_files': {}}
        new_run_id = 0

    if cdc_prefixes:
        new_cdc = {}
        # Add any newly added identifiers, update previous prefixes, drop missing ones
        for prefix in cdc_prefixes:
            old_cdc = old['cdc_files']
            old_files = old_cdc.get(prefix, {}).get('files', [])
            since = old_cdc.get(prefix, {}).get('max_ts',
                                                "1970-01-01 00:00:00.000")
            files, max_ts = find_latest(old_files,
                                        s3_list(s3, prefix, ListType.full),
                                        since)
            new_cdc[prefix] = {'files': files, 'max_ts': max_ts}
    else:
        new_cdc = {}

    if full_load_prefixes:
        new_full = {}
        for prefix in full_load_prefixes:
            files = s3_list(s3, prefix, ListType.full)
            new_full[prefix] = {'files': [x[0] for x in files]}
    else:
        new_full = {}

    output = {
        'cdc_files': new_cdc,
        'full_load_files': new_full,
        'run_id': new_run_id
    }
    return output
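
find_latest is also referenced but not shown. Based on the call above (previously processed files, a fresh listing with timestamps, and a 'since' timestamp), a plausible reading is that it returns the not-yet-processed files newer than 'since' together with the newest timestamp seen; the sketch below is an assumption, not the original implementation:

from datetime import datetime
import pytz

def find_latest(old_files, listing, since):
    # Assumed sketch: keep keys modified after 'since' that were not already processed,
    # and report the newest LastModified so the next run can pick up where this one stopped.
    since_ts = datetime.strptime(since, "%Y-%m-%d %H:%M:%S.%f").replace(tzinfo=pytz.UTC)
    new_files = [key for key, modified in listing
                 if modified > since_ts and key not in old_files]
    max_ts = max((modified for _, modified in listing), default=since_ts)
    return new_files, max_ts.strftime("%Y-%m-%d %H:%M:%S.%f")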
Example #8
def start_emr(emr_path, s3_client, emr_client, bootstrap_path, start_path):
    """
    Starts the emr job that will run the pyspark logic
    :param emr_path:
    :param s3_client:
    :param emr_client:
    :param bootstrap_path:
    :param start_path:
    :return:
    """
    config_bucket, config_prefix = get_bucket_key(emr_path)
    s3_client.download_file(config_bucket, config_prefix, 'emr_details.json')
    with open('emr_details.json', 'r') as emr_details_file:
        emr_details = json.loads(emr_details_file.read())
    os.remove('emr_details.json')

    # Copy the module-level defaults so repeated calls do not keep mutating them
    # (assumes the stdlib copy module is imported alongside the other imports)
    default_step = copy.deepcopy(DEFAULT_STEP)
    default_step['HadoopJarStep']['Args'].append(start_path)
    steps = [default_step]
    steps.extend(emr_details['steps'])

    default_bootstrap = copy.deepcopy(DEFAULT_BOOTSTRAP)
    default_bootstrap['ScriptBootstrapAction']['Path'] = bootstrap_path
    bootstraps = [default_bootstrap]
    bootstraps.extend(emr_details['bootstraps'])

    response = emr_client.run_job_flow(
        Name=emr_details['name'],
        LogUri=emr_details['log_location'],
        ReleaseLabel=emr_details['emr_version'],
        Instances=emr_details['instances'],
        Steps=steps,
        BootstrapActions=bootstraps,
        Applications=emr_details['applications'],
        # Configurations=emr_details.get('configs'),
        VisibleToAllUsers=emr_details['visible_all_users'],
        JobFlowRole=emr_details['emr_ec2_role'],
        ServiceRole=emr_details['emr_service_role'],
        Tags=emr_details.get('tags'))
    return response
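
DEFAULT_STEP and DEFAULT_BOOTSTRAP are module-level constants that this listing does not include. Given how start_emr fills them in, they presumably look roughly like the following; the names, the script-runner jar location, and the ActionOnFailure setting are assumptions rather than values from the original source:

DEFAULT_STEP = {
    'Name': 'Start pyspark job',
    'ActionOnFailure': 'TERMINATE_CLUSTER',
    'HadoopJarStep': {
        # start_path (the uploaded start_job.sh) is appended to Args before launch
        'Jar': 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
        'Args': []
    }
}

DEFAULT_BOOTSTRAP = {
    'Name': 'Install requirements',
    'ScriptBootstrapAction': {
        # Path is overwritten with the uploaded bootstrap.sh before launch
        'Path': '',
        'Args': []
    }
}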
Example #9
    def test_copy_threaded(self):
        s3 = boto3.client('s3')
        input_prefix = f's3://{BUCKET}/subprefix1/'
        output_prefix = f's3://{BUCKET2}/somenewprefix/'
        results = copy_prefix_threaded(s3=s3,
                                       in_prefix=input_prefix,
                                       out_prefix=output_prefix,
                                       threads=10)
        expected_results = []
        for x in range(0, 1000):
            expected_results.append(
                f"s3://{BUCKET2}/somenewprefix/subprefix2/{local_files[1].replace('b', str(x))}"
            )
        for x in range(1000, 2000):
            expected_results.append(
                f"s3://{BUCKET2}/somenewprefix/subprefix2/subprefix3/{local_files[1].replace('b', str(x))}"
            )
        self.assertCountEqual(results, expected_results)

        # Clean up
        for path in expected_results:
            bucket, prefix = get_bucket_key(path)
            s3.delete_object(Bucket=bucket, Key=prefix)