def upload_bootstrap(package_path, s3_client, launch_prefix):
    """
    Uploads the bootstrap.sh script to be used by the EMR job during the bootstrap step
    :param package_path: s3 path of the packaged project code, e.g. s3://bucket/prefix/package.tar.gz
    :param s3_client: boto3 s3 client
    :param launch_prefix: s3 prefix where the rendered launch scripts are uploaded, e.g. s3://bucket/launch/
    :return: s3 path of the uploaded bootstrap.sh
    """
    bucket, prefix = get_bucket_key(package_path)
    with open('scripts/bootstrap_template.sh', 'r') as bootstrap_file:
        text = bootstrap_file.read().format(bucket=bucket,
                                            prefix=prefix,
                                            package=prefix.split('/')[-1])
    with open('bootstrap.sh', 'w+') as bootstrap:
        bootstrap.write(text)
    bootstrap_bucket, bootstrap_prefix = get_bucket_key(launch_prefix)
    s3_client.upload_file('bootstrap.sh', bootstrap_bucket,
                          f'{bootstrap_prefix}bootstrap.sh')
    os.remove('bootstrap.sh')
    return f's3://{bootstrap_bucket}/{bootstrap_prefix}bootstrap.sh'
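
# Hedged usage sketch for upload_bootstrap. The bucket names and package path below
# are placeholders, not values from this repo's configuration; the only assumption
# taken from the code above is that scripts/bootstrap_template.sh exposes {bucket},
# {prefix} and {package} placeholders consumed by the .format() call.
def _example_upload_bootstrap(s3_client):
    # package_path points at the packaged project code; launch_prefix is where the
    # rendered bootstrap.sh is written for the EMR cluster to fetch.
    return upload_bootstrap(
        package_path='s3://example-artifacts/releases/my_package.tar.gz',
        s3_client=s3_client,
        launch_prefix='s3://example-artifacts/launch/')
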
def s3_copy(items: list, input_prefix: str, output_prefix: str, s3_client, retries: int):
    """
    Given a list of files to be copied, returns a list of the final locations of the copied
    files after performing the copying
    :param items: e.g. ['s3://bucket/prefix/file1.txt', 's3://bucket/prefix/file2.txt']
    :param input_prefix: e.g. 's3://bucket/prefix/'
    :param output_prefix: e.g. 's3://bucket2/prefix2/'
    :param s3_client: boto3 s3 client
    :param retries: e.g. 5
    :return: e.g. ['s3://bucket2/prefix2/file1.txt', 's3://bucket2/prefix2/file2.txt']
    """
    in_bucket, in_prefix = get_bucket_key(input_prefix)
    out_bucket, out_prefix = get_bucket_key(output_prefix)
    new_files = []
    for s3_key in items:
        result = individual_copy_file(file=s3_key,
                                      input_prefix=in_prefix,
                                      output_prefix=out_prefix,
                                      input_bucket=in_bucket,
                                      output_bucket=out_bucket,
                                      s3_client=s3_client,
                                      max_retries=retries)
        new_files.append(result)
    return new_files
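
# individual_copy_file is defined elsewhere in this repo; the sketch below only
# illustrates the behaviour s3_copy appears to rely on, judging from its call
# signature and docstring and from the expected results in test_copy_threaded:
# the key suffix relative to input_prefix is preserved under output_prefix, the
# copy is retried up to max_retries times, and the new s3 path is returned.
# The real helper may differ.
def _sketch_individual_copy_file(file, input_prefix, output_prefix,
                                 input_bucket, output_bucket, s3_client, max_retries):
    # Strip the bucket and input prefix to get the key suffix, then re-root it
    # under the output prefix.
    relative_key = file.replace(f's3://{input_bucket}/', '')[len(input_prefix):]
    new_key = f'{output_prefix}{relative_key}'
    for attempt in range(max_retries):
        try:
            s3_client.copy_object(Bucket=output_bucket,
                                  Key=new_key,
                                  CopySource={'Bucket': input_bucket,
                                              'Key': f'{input_prefix}{relative_key}'})
            break
        except Exception:
            # Re-raise once the retry budget is exhausted.
            if attempt == max_retries - 1:
                raise
    return f's3://{output_bucket}/{new_key}'
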
def s3_list(s3, path: str, how: typing.Optional[ListType] = ListType.object_only) -> list:
    """
    :param s3: boto3 s3 client
    :param str path: the s3 path to list e.g. s3://bucket/prefix/subprefix/
    :param how: full -> s3://bucket/prefix/file.txt | prefix -> prefix/file.txt | object_only -> file.txt (default)
    :return list: list of tuples (key, last_modified) e.g. (prefix/file.txt, datetime(2020, 1, 1))
    """
    bucket, prefix = get_bucket_key(path)
    still_more = True
    continuation_token = None
    output = []
    while still_more:
        response = list_more(s3, bucket, prefix, continuation_token)
        if how == ListType.full:
            current_output = [(f"s3://{bucket}/{x['Key']}", x['LastModified'].replace(tzinfo=pytz.UTC))
                              for x in response['Contents'] if not x['Key'].endswith('/')]
        elif how == ListType.prefix:
            current_output = [(x['Key'], x['LastModified'].replace(tzinfo=pytz.UTC))
                              for x in response['Contents'] if not x['Key'].endswith('/')]
        elif how == ListType.object_only:
            current_output = [(x['Key'].split('/')[-1], x['LastModified'].replace(tzinfo=pytz.UTC))
                              for x in response['Contents'] if not x['Key'].endswith('/')]
        else:
            raise ValueError('how must be a valid ListType')
        output.extend(current_output)
        if response['IsTruncated']:
            continuation_token = response['NextContinuationToken']
        else:
            still_more = False
    return output
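
# list_more is defined elsewhere in this repo. Based on the response keys s3_list
# reads ('Contents', 'IsTruncated', 'NextContinuationToken'), it is assumed to be
# a thin wrapper around list_objects_v2; this sketch is illustrative only and may
# not match the real helper.
def _sketch_list_more(s3, bucket, prefix, continuation_token=None):
    kwargs = {'Bucket': bucket, 'Prefix': prefix}
    if continuation_token:
        kwargs['ContinuationToken'] = continuation_token
    # Returns a dict containing 'Contents', 'IsTruncated' and, when truncated,
    # 'NextContinuationToken' -- exactly the keys s3_list relies on.
    return s3.list_objects_v2(**kwargs)
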
def upload_new_run_info(last_run, run_info_prefix, cdc_paths, full_paths, s3_client) -> str:
    """
    Uploads the new run info file to s3 to be used by the spark job downstream and returns
    the s3 path of the uploaded file
    :param last_run: The location of the specific last run info file, from a previous job run, if one exists
    :param run_info_prefix: The prefix where the current run info file will be stored, typically the prefix of the last_run location
    :param cdc_paths: A list of cdc prefixes to be included
    :param full_paths: A list of full load prefixes to be included
    :param s3_client: s3 boto3 client
    :return: The s3 path of the uploaded file
    """
    if last_run:
        this_run = get_old_new(s3=s3_client, old_info=last_run,
                               cdc_prefixes=cdc_paths, full_load_prefixes=full_paths)
    else:
        this_run = get_old_new(s3=s3_client, cdc_prefixes=cdc_paths,
                               full_load_prefixes=full_paths)
    job_run_bucket, job_run_prefix = get_bucket_key(run_info_prefix)
    this_run_id = this_run['run_id']
    new_run_name = f'run_{this_run_id}.json'
    with open(new_run_name, 'w+') as this_run_file:
        this_run_file.write(json.dumps(this_run))
    this_job_run_key = f"{job_run_prefix}{new_run_name}"
    s3_client.upload_file(new_run_name, job_run_bucket, this_job_run_key)
    this_job_run_full_path = f's3://{job_run_bucket}/{this_job_run_key}'
    os.remove(new_run_name)
    return this_job_run_full_path
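
# Hedged usage sketch. The prefixes below are placeholders; in the pipeline run()
# passes the values read from inputs.json (see run() below). The uploaded
# run_<id>.json document has the shape illustrated after get_old_new below.
def _example_upload_new_run_info(s3_client):
    return upload_new_run_info(
        last_run='s3://example-jobs/run_info/run_2.json',   # or None on the first run
        run_info_prefix='s3://example-jobs/run_info/',
        cdc_paths=['s3://example-raw/cdc/table_a/'],
        full_paths=['s3://example-raw/full/table_b/'],
        s3_client=s3_client)
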
def upload_start_job(script, spark_args, s3_client, job_run_full, launch_path):
    """
    Uploads the start_job.sh script which is responsible for starting the pyspark script
    :param script: s3 path of the pyspark script to run
    :param spark_args: additional arguments appended after the --run_info argument
    :param s3_client: boto3 s3 client
    :param job_run_full: s3 path of the run info file for this job run
    :param launch_path: s3 prefix where the rendered launch scripts are uploaded
    :return: s3 path of the uploaded start_job.sh
    """
    with open('scripts/start_job_template.sh', 'r') as start_job_file:
        text = start_job_file.read().format(
            job_script=script,
            arguments=f"--run_info {job_run_full} {spark_args}")
    with open('start_job.sh', 'w+') as start_job:
        start_job.write(text)
    launch_bucket, launch_prefix = get_bucket_key(launch_path)
    s3_client.upload_file('start_job.sh', launch_bucket, f'{launch_prefix}start_job.sh')
    os.remove('start_job.sh')
    return f's3://{launch_bucket}/{launch_prefix}start_job.sh'
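
# Hedged usage sketch for upload_start_job. The only assumption taken from the
# code above is that scripts/start_job_template.sh exposes {job_script} and
# {arguments} placeholders, with the rendered arguments string becoming
# "--run_info <run info path> <spark_args>". All paths below are placeholders.
def _example_upload_start_job(s3_client, job_run_full):
    return upload_start_job(
        script='s3://example-artifacts/jobs/transform.py',
        spark_args='--env dev',
        s3_client=s3_client,
        job_run_full=job_run_full,
        launch_path='s3://example-artifacts/launch/')
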
def run(emr_client, inputs_file, last_run_path, s3_client):
    """
    Downloads the job inputs file, uploads the run info and launch scripts, and starts the EMR job
    """
    inputs_bucket, inputs_prefix = get_bucket_key(inputs_file)
    s3_client.download_file(inputs_bucket, inputs_prefix, 'inputs.json')
    with open('inputs.json', 'r') as downloaded_inputs:
        inputs = json.loads(downloaded_inputs.read())
    os.remove('inputs.json')

    # Bookmarking/file tracking file
    this_job_run_full_path = upload_new_run_info(
        last_run=last_run_path,
        run_info_prefix=inputs['run_info_prefix'],
        cdc_paths=inputs.get('cdc_paths'),
        full_paths=inputs.get('full_paths'),
        s3_client=s3_client)

    # Shell script that runs spark-submit to start the spark job
    start_path = upload_start_job(script=inputs['script'],
                                  spark_args=inputs['spark_args'],
                                  s3_client=s3_client,
                                  job_run_full=this_job_run_full_path,
                                  launch_path=inputs['launch_prefix'])

    # Shell script that installs requirements on all cluster nodes
    bootstrap_path = upload_bootstrap(package_path=inputs['package'],
                                      s3_client=s3_client,
                                      launch_prefix=inputs['launch_prefix'])

    response = start_emr(emr_path=inputs['emr_details'],
                         s3_client=s3_client,
                         emr_client=emr_client,
                         start_path=start_path,
                         bootstrap_path=bootstrap_path)
    return response
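
# Illustrative shape of the inputs.json document run() expects, based only on the
# keys read above; every value is a placeholder, not real configuration.
_EXAMPLE_INPUTS = {
    'run_info_prefix': 's3://example-jobs/run_info/',
    'cdc_paths': ['s3://example-raw/cdc/table_a/'],        # optional
    'full_paths': ['s3://example-raw/full/table_b/'],      # optional
    'script': 's3://example-artifacts/jobs/transform.py',
    'spark_args': '--env dev',
    'launch_prefix': 's3://example-artifacts/launch/',
    'package': 's3://example-artifacts/releases/my_package.tar.gz',
    'emr_details': 's3://example-jobs/config/emr_details.json'
}
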
def get_old_new(s3, cdc_prefixes: typing.Optional[list] = None,
                full_load_prefixes: typing.Optional[list] = None,
                old_info: typing.Optional[str] = None):
    """
    Gets the list of objects of old and new and compares them. Returns a dictionary that
    conforms to the old_info format which details the files that are yet to be processed.
    :param s3: boto3 s3 client
    :param old_info: s3 location of the last job run info
    :param cdc_prefixes: list of cdc prefixes
    :param full_load_prefixes: list of full_load prefixes
    :return: dict with 'cdc_files', 'full_load_files' and 'run_id'
    """
    if not cdc_prefixes and not full_load_prefixes:
        raise ValueError(
            "cdc_prefixes and full_load_prefixes cannot both be null. One must be specified"
        )
    if old_info:
        old_bucket, old_prefix = get_bucket_key(old_info)
        s3.download_file(old_bucket, old_prefix, 'old_info.json')
        with open("old_info.json", "r") as old_file:
            old = json.loads(old_file.read())
        os.remove('old_info.json')
        new_run_id = old['run_id'] + 1
    else:
        # Assumes that there are no previous runs/no previously processed files
        old = {'cdc_files': {}}
        new_run_id = 0
    if cdc_prefixes:
        new_cdc = {}
        # Add any newly added identifiers, update previous prefixes, drop missing ones
        for prefix in cdc_prefixes:
            old_cdc = old['cdc_files']
            old_files = old_cdc.get(prefix, {}).get('files', [])
            since = old_cdc.get(prefix, {}).get('max_ts', "1970-01-01 00:00:00.000")
            files, max_ts = find_latest(old_files, s3_list(s3, prefix, ListType.full), since)
            new_cdc[prefix] = {'files': files, 'max_ts': max_ts}
    else:
        new_cdc = {}
    if full_load_prefixes:
        new_full = {}
        for prefix in full_load_prefixes:
            files = s3_list(s3, prefix, ListType.full)
            new_full[prefix] = {'files': [x[0] for x in files]}
    else:
        new_full = {}
    output = {
        'cdc_files': new_cdc,
        'full_load_files': new_full,
        'run_id': new_run_id
    }
    return output
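
# Illustrative shape of the dictionary returned by get_old_new (and therefore of
# the run_<id>.json file uploaded by upload_new_run_info above). The keys come
# from the code above; the prefixes, file names and timestamps are placeholders.
_EXAMPLE_RUN_INFO = {
    'cdc_files': {
        's3://example-raw/cdc/table_a/': {
            'files': ['s3://example-raw/cdc/table_a/20200101-120000.csv'],
            'max_ts': '2020-01-01 12:00:00.000'
        }
    },
    'full_load_files': {
        's3://example-raw/full/table_b/': {
            'files': ['s3://example-raw/full/table_b/LOAD00000001.csv']
        }
    },
    'run_id': 3
}
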
def start_emr(emr_path, s3_client, emr_client, bootstrap_path, start_path):
    """
    Starts the emr job that will run the pyspark logic
    :param emr_path: s3 path of the emr_details.json config file
    :param s3_client: boto3 s3 client
    :param emr_client: boto3 emr client
    :param bootstrap_path: s3 path of the uploaded bootstrap.sh
    :param start_path: s3 path of the uploaded start_job.sh
    :return: the run_job_flow response
    """
    config_bucket, config_prefix = get_bucket_key(emr_path)
    s3_client.download_file(config_bucket, config_prefix, 'emr_details.json')
    with open('emr_details.json', 'r') as emr_details_file:
        emr_details = json.loads(emr_details_file.read())
    os.remove('emr_details.json')

    DEFAULT_STEP['HadoopJarStep']['Args'].append(start_path)
    steps = [DEFAULT_STEP]
    steps.extend(emr_details['steps'])

    DEFAULT_BOOTSTRAP['ScriptBootstrapAction']['Path'] = bootstrap_path
    bootstraps = [DEFAULT_BOOTSTRAP]
    bootstraps.extend(emr_details['bootstraps'])

    response = emr_client.run_job_flow(
        Name=emr_details['name'],
        LogUri=emr_details['log_location'],
        ReleaseLabel=emr_details['emr_version'],
        Instances=emr_details['instances'],
        Steps=steps,
        BootstrapActions=bootstraps,
        Applications=emr_details['applications'],
        # Configurations=emr_details.get('configs'),
        VisibleToAllUsers=emr_details['visible_all_users'],
        JobFlowRole=emr_details['emr_ec2_role'],
        ServiceRole=emr_details['emr_service_role'],
        Tags=emr_details.get('tags'))
    return response
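
# Illustrative shape of the emr_details.json document start_emr expects, based on
# the keys read above. The values are placeholders; Instances, Applications and
# the other nested structures follow the boto3 run_job_flow schema.
_EXAMPLE_EMR_DETAILS = {
    'name': 'example-cdc-job',
    'log_location': 's3://example-jobs/emr-logs/',
    'emr_version': 'emr-5.30.0',
    'instances': {},        # passed straight through as the run_job_flow Instances argument
    'steps': [],            # extra steps appended after the default start_job step
    'bootstraps': [],       # extra bootstrap actions appended after the default one
    'applications': [{'Name': 'Spark'}],
    'visible_all_users': True,
    'emr_ec2_role': 'EMR_EC2_DefaultRole',
    'emr_service_role': 'EMR_DefaultRole',
    'tags': [{'Key': 'project', 'Value': 'example'}]
}
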
def test_copy_threaded(self):
    s3 = boto3.client('s3')
    input_prefix = f's3://{BUCKET}/subprefix1/'
    output_prefix = f's3://{BUCKET2}/somenewprefix/'
    results = copy_prefix_threaded(s3=s3,
                                   in_prefix=input_prefix,
                                   out_prefix=output_prefix,
                                   threads=10)
    expected_results = []
    for x in range(0, 1000):
        expected_results.append(
            f"s3://{BUCKET2}/somenewprefix/subprefix2/{local_files[1].replace('b', str(x))}"
        )
    for x in range(1000, 2000):
        expected_results.append(
            f"s3://{BUCKET2}/somenewprefix/subprefix2/subprefix3/{local_files[1].replace('b', str(x))}"
        )
    self.assertCountEqual(results, expected_results)
    # Clean up
    for path in expected_results:
        bucket, prefix = get_bucket_key(path)
        s3.delete_object(Bucket=bucket, Key=prefix)