Example no. 1
0
def handler(config, task, updater, main_input, path_input, merge_inputs):
    """
    Example divide handler: pulls a zipped word list from S3, splits it into
    two halves, uploads each half back to S3 as its own zip archive, and
    schedules a child divide task per half until the maximum depth is reached.
    """
    max_level = main_input['max_level']
    s3_dir = main_input['s3_dir']
    s3_file = main_input['s3_file']

    updater.begin_task("Dividing input range at level {}".format(
        len(path_input)))

    response = harness.divide_response.DivideResponse(path_input)
    response.merge_task.task_input = main_input

    # At the maximum divide depth there is nothing left to split.
    if len(path_input) >= max_level:
        return response.as_dict()

    updater.begin_task("Fetching data to divide from S3")

    suffix = "".join(str(p) for p in path_input)
    src_zip_name = "%s%s.zip" % (s3_file, suffix)
    src_text_name = "%s%s.txt" % (s3_file, suffix)
    src_zip_path = os.path.join(tempfile.gettempdir(), src_zip_name)

    config.s3_resource.meta.client.download_file(
        Bucket=config.config_bucket,
        Key=util.s3_key_join(s3_dir, src_zip_name),
        Filename=src_zip_path)

    # Pull every line of the archived text file into memory.
    with zipfile.ZipFile(src_zip_path, "r") as src_zip:
        with src_zip.open(src_text_name, "r") as src_text:
            all_lines = src_text.readlines()

    # Split the data into two (roughly equal) halves.
    midpoint = len(all_lines) // 2
    halves = (all_lines[:midpoint], all_lines[midpoint:])

    for seq, half in enumerate(halves):
        updater.begin_task("Creating and uploading segment %d of 2" %
                           (seq + 1))

        stem = "%s%s%d" % (s3_file, suffix, seq)
        seg_text_name = stem + ".txt"
        seg_zip_name = stem + ".zip"
        seg_zip_path = os.path.join(tempfile.gettempdir(), seg_zip_name)

        with zipfile.ZipFile(seg_zip_path, "w",
                             zipfile.ZIP_DEFLATED) as seg_zip:
            with seg_zip.open(seg_text_name, "w") as seg_text:
                seg_text.write(b''.join(half))

        config.s3_resource.meta.client.upload_file(
            Filename=seg_zip_path,
            Bucket=config.config_bucket,
            Key=util.s3_key_join(s3_dir, seg_zip_name))

    # Schedule a child divide task for each half; each child gets its own
    # copy of the main input so later mutations cannot leak across tasks.
    for branch in range(2):
        response.add_divide_task(copy.deepcopy(main_input), branch)

    return response.as_dict()
Example no. 2
0
def handler(config, task, updater, main_input, path_input, merge_inputs):
    """
    A simple example of a build handler that downloads a segment of a
    dictionary from S3, sorts it case-insensitively using the built-in sort,
    and uploads the results back to S3.

    Returns a dict with the uploaded 'zip_name' and 'tail' (the last element
    of path_input, or None when path_input is empty).
    """
    updater.post_task_update("Downloading data from S3")

    s3_dir = main_input['s3_dir']
    s3_file = main_input['s3_file']
    # The path through the divide tree identifies this segment's files.
    path_string = "".join([str(x) for x in path_input])
    file_prefix = "%s%s" % (s3_file, path_string)
    text_name = file_prefix + ".txt"
    zip_name = file_prefix + ".zip"
    zip_path = os.path.join(tempfile.gettempdir(), zip_name)

    config.s3_resource.meta.client.download_file(Bucket=config.config_bucket,
                                                 Key=util.s3_key_join(
                                                     s3_dir, zip_name),
                                                 Filename=zip_path)

    updater.post_task_update("Processing data")

    with zipfile.ZipFile(zip_path, "r") as zp:
        with zp.open(text_name, "r") as fp:
            lines = fp.readlines()

    # Case-insensitive sort of the raw byte lines.
    lines.sort(key=bytes.lower)

    # Rewrite the same local zip in place with the sorted contents.
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zp:
        with zp.open(text_name, "w") as fp:
            fp.write(b"".join(lines))

    updater.post_task_update("Uploading results to S3")

    config.s3_resource.meta.client.upload_file(Bucket=config.config_bucket,
                                               Key=util.s3_key_join(
                                                   s3_dir, zip_name),
                                               Filename=zip_path)

    # Guard the empty-path case instead of raising IndexError, matching the
    # contract of the other handlers' return values.
    return {
        'zip_name': zip_name,
        'tail': path_input[-1] if len(path_input) else None
    }
Example no. 3
0
def handler(config, task, updater, main_input, path_input, merge_inputs):
    """
    A simple example of a merge handler that downloads sorted segments of a
    dictionary from S3, merges them together into a single sorted dictionary,
    and uploads the results back to S3.

    Each entry in merge_inputs is expected to carry a 'zip_name' key naming a
    zip archive in S3 that contains one pre-sorted text segment.  Returns a
    dict with the merged 'zip_name' and 'tail' (last path element, or None at
    the root of the divide tree).
    """
    s3_dir = main_input['s3_dir']
    s3_file = main_input['s3_file']

    merge_sources = []

    # Download each child segment and wrap its lines in a LineSource cursor.
    for idx, input in enumerate(merge_inputs):
        updater.post_task_update("Downloading chunk %d of %d from S3" %
                                 (idx + 1, len(merge_inputs)))

        zip_name = input['zip_name']
        # Derive the inner text file name by swapping the ".zip" suffix
        # for ".txt" (assumes zip_name always ends in ".zip").
        text_name = zip_name[:-3] + "txt"
        local_zip_path = os.path.join(tempfile.gettempdir(), zip_name)

        config.s3_resource.meta.client.download_file(
            Bucket=config.config_bucket,
            Key=util.s3_key_join(s3_dir, zip_name),
            Filename=local_zip_path)

        with zipfile.ZipFile(local_zip_path, "r") as zp:
            with zp.open(text_name, "r") as fp:
                lines = fp.readlines()

        # Second argument is this source's position in merge_sources;
        # presumably stored as LineSource.list_position (see deletion below).
        merge_sources.append(LineSource(lines, len(merge_sources)))

    updater.post_task_update("Merging %d data sources" % len(merge_inputs))

    out_lines = []

    # Merge the multiple independently-sorted sources by popping the minimum off of each stack until there are none left
    while len(merge_sources) > 1:
        # Determine which source has the minimum value, and append its line to the output
        # (assumes LineSource.current exposes the line at LineSource.index —
        # TODO confirm against the LineSource definition)
        min_source = min(merge_sources, key=lambda x: x.current)
        out_lines.append(min_source.lines[min_source.index])

        # Advance that source to the next line
        if not min_source.advance():
            # We are out of lines remaining in this source, so discard it
            del merge_sources[min_source.list_position]

            # Renumber the remaining sources
            for idx, src in enumerate(merge_sources):
                src.list_position = idx

    # With only one source left, we can just dump the remaining lines from it to the output
    # NOTE(review): raises IndexError if merge_inputs was empty — verify the
    # framework guarantees at least one merge input per merge task.
    out_lines.extend(merge_sources[0].lines[merge_sources[0].index:])

    # Write the output to a file
    out_basename = "%s_sorted%s" % (s3_file, "".join(
        [str(x) for x in path_input]))
    out_zip_name = out_basename + ".zip"
    out_txt_name = out_basename + ".txt"
    out_zip_path = os.path.join(tempfile.gettempdir(), out_zip_name)

    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zp:
        with zp.open(out_txt_name, "w") as fp:
            fp.write(b"".join(out_lines))

    updater.post_task_update("Uploading results to S3")

    config.s3_resource.meta.client.upload_file(Filename=out_zip_path,
                                               Bucket=config.config_bucket,
                                               Key=util.s3_key_join(
                                                   s3_dir, out_zip_name))

    # 'tail' is None at the root (empty path), mirroring the build handler.
    return {
        'zip_name': out_zip_name,
        'tail': path_input[-1] if len(path_input) else None
    }