def handler(config, task, updater, main_input, path_input, merge_inputs):
    """A simple example of a divide handler that downloads a dictionary of
    words from S3, splits it in half, and uploads the results back to S3.

    While the current divide level is below ``main_input['max_level']``, the
    word list for this path is fetched, cut into two halves, each half is
    uploaded as its own zip, and two child divide tasks are registered on the
    response.  The merge task always receives ``main_input`` unchanged.
    """
    max_level = main_input['max_level']
    s3_dir = main_input['s3_dir']
    s3_file = main_input['s3_file']

    updater.begin_task("Dividing input range at level {}".format(
        len(path_input)))

    response = harness.divide_response.DivideResponse(path_input)
    response.merge_task.task_input = main_input

    if len(path_input) < max_level:
        updater.begin_task("Fetching data to divide from S3")

        # File names encode the divide path, e.g. "words01.zip" at path [0, 1].
        path_string = "".join([str(x) for x in path_input])
        zip_name = "%s%s.zip" % (s3_file, path_string)
        text_name = "%s%s.txt" % (s3_file, path_string)
        in_zip_path = os.path.join(tempfile.gettempdir(), zip_name)

        config.s3_resource.meta.client.download_file(
            Bucket=config.config_bucket,
            Key=util.s3_key_join(s3_dir, zip_name),
            Filename=in_zip_path)

        # Read every line of the archived text file, then cut the list in two.
        with zipfile.ZipFile(in_zip_path, "r") as in_zip:
            with in_zip.open(text_name, "r") as in_data:
                lines = in_data.readlines()

        split_count = len(lines) >> 1
        batches = (lines[:split_count], lines[split_count:])

        # Zip and upload each half under its own child path suffix (0 / 1).
        for idx, batch in enumerate(batches):
            updater.begin_task("Creating and uploading segment %d of 2" %
                               (idx + 1))
            file_prefix = "%s%s%d" % (s3_file, path_string, idx)
            out_text_name = file_prefix + ".txt"
            out_zip_name = file_prefix + ".zip"
            out_zip_path = os.path.join(tempfile.gettempdir(), out_zip_name)

            with zipfile.ZipFile(out_zip_path, "w",
                                 zipfile.ZIP_DEFLATED) as out_zip:
                with out_zip.open(out_text_name, "w") as out_text:
                    out_text.write(b''.join(batch))

            config.s3_resource.meta.client.upload_file(
                Filename=out_zip_path,
                Bucket=config.config_bucket,
                Key=util.s3_key_join(s3_dir, out_zip_name))

        # Register one child divide task per uploaded half.
        for x in [0, 1]:
            child_input = copy.deepcopy(main_input)
            response.add_divide_task(child_input, x)

    return response.as_dict()
def handler(config, task, updater, main_input, path_input, merge_inputs):
    """A simple example of a build handler that downloads a segment of a
    dictionary from S3, sorts it using the built-in sort function, and
    uploads the results back to S3.

    Returns a dict with the uploaded ``zip_name`` and the ``tail`` (last
    element) of this task's divide path, for consumption by the merge step.
    """
    updater.post_task_update("Downloading data from S3")

    s3_dir = main_input['s3_dir']
    s3_file = main_input['s3_file']

    # The segment's file names encode its divide path, e.g. "words01.zip".
    path_string = "".join([str(x) for x in path_input])
    file_prefix = "%s%s" % (s3_file, path_string)
    text_name = file_prefix + ".txt"
    zip_name = file_prefix + ".zip"
    zip_path = os.path.join(tempfile.gettempdir(), zip_name)

    config.s3_resource.meta.client.download_file(
        Bucket=config.config_bucket,
        Key=util.s3_key_join(s3_dir, zip_name),
        Filename=zip_path)

    updater.post_task_update("Processing data")

    # Pull the raw lines out of the archive, sort them case-insensitively
    # (lines are bytes, hence bytes.lower as the key), and rewrite the
    # same archive in place with the sorted content.
    with zipfile.ZipFile(zip_path, "r") as zp:
        with zp.open(text_name, "r") as fp:
            lines = fp.readlines()

    lines.sort(key=bytes.lower)

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zp:
        with zp.open(text_name, "w") as fp:
            fp.write(b"".join(lines))

    updater.post_task_update("Uploading results to S3")

    config.s3_resource.meta.client.upload_file(
        Bucket=config.config_bucket,
        Key=util.s3_key_join(s3_dir, zip_name),
        Filename=zip_path)

    return {'zip_name': zip_name, 'tail': path_input[-1]}
def handler(config, task, updater, main_input, path_input, merge_inputs):
    """A simple example of a merge handler that downloads sorted segments of a
    dictionary from S3, merges them together into a single sorted dictionary,
    and uploads the results back to S3.

    Each entry of ``merge_inputs`` names a zip (produced by the build step)
    containing one independently-sorted text file.  Returns a dict with the
    merged ``zip_name`` and the ``tail`` of this task's divide path (None at
    the root, where ``path_input`` is empty).

    Fixes vs. the original: guards the post-merge drain against an empty
    ``merge_inputs`` (previously an IndexError), and the loop variable no
    longer shadows the ``input`` builtin.
    """
    s3_dir = main_input['s3_dir']
    s3_file = main_input['s3_file']

    # Download each sorted chunk and wrap its lines in a LineSource cursor;
    # the second argument is the source's position in the merge_sources list.
    merge_sources = []
    for idx, merge_input in enumerate(merge_inputs):
        updater.post_task_update("Downloading chunk %d of %d from S3" %
                                 (idx + 1, len(merge_inputs)))
        zip_name = merge_input['zip_name']
        text_name = zip_name[:-3] + "txt"  # "foo.zip" -> "foo.txt"
        local_zip_path = os.path.join(tempfile.gettempdir(), zip_name)
        config.s3_resource.meta.client.download_file(
            Bucket=config.config_bucket,
            Key=util.s3_key_join(s3_dir, zip_name),
            Filename=local_zip_path)
        with zipfile.ZipFile(local_zip_path, "r") as zp:
            with zp.open(text_name, "r") as fp:
                lines = fp.readlines()
        merge_sources.append(LineSource(lines, len(merge_sources)))

    updater.post_task_update("Merging %d data sources" % len(merge_inputs))

    out_lines = []

    # Merge the multiple independently-sorted sources by popping the minimum
    # off of each stack until there are none left
    while len(merge_sources) > 1:
        # Determine which source has the minimum value, and append its line
        # to the output
        min_source = min(merge_sources, key=lambda x: x.current)
        out_lines.append(min_source.lines[min_source.index])

        # Advance that source to the next line
        if not min_source.advance():
            # We are out of lines remaining in this source, so discard it
            del merge_sources[min_source.list_position]

            # Renumber the remaining sources
            for idx, src in enumerate(merge_sources):
                src.list_position = idx

    # With only one source left, we can just dump the remaining lines from it
    # to the output.  Guard against merge_inputs being empty, in which case
    # there is no source at all and the output stays empty.
    if merge_sources:
        out_lines.extend(merge_sources[0].lines[merge_sources[0].index:])

    # Write the output to a file
    out_basename = "%s_sorted%s" % (s3_file, "".join(
        [str(x) for x in path_input]))
    out_zip_name = out_basename + ".zip"
    out_txt_name = out_basename + ".txt"
    out_zip_path = os.path.join(tempfile.gettempdir(), out_zip_name)

    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zp:
        with zp.open(out_txt_name, "w") as fp:
            fp.write(b"".join(out_lines))

    updater.post_task_update("Uploading results to S3")

    config.s3_resource.meta.client.upload_file(
        Filename=out_zip_path,
        Bucket=config.config_bucket,
        Key=util.s3_key_join(s3_dir, out_zip_name))

    return {
        'zip_name': out_zip_name,
        'tail': path_input[-1] if len(path_input) else None
    }