def disk_benchmark(
        input_directory, output_directory, benchmark_size_per_disk, **kwargs):
    """Build a MapReduce job config that runs the disk benchmark, processing
    benchmark_size_per_disk bytes on each disk."""

    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "disk_speeds")
    (input_url, output_url) = utils.generate_urls(
        input_directory, output_directory, None)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="DiskBenchmarkMapFunction",
        reduce_function="DiskBenchmarkReduceFunction")

    utils.force_single_partition(config)

    data_size_bytes = int(uc.parse_and_convert(benchmark_size_per_disk, "B"))

    config_params = {"DISK_BENCHMARK_DATA_SIZE": data_size_bytes}

    if "params" not in config:
        config["params"] = {}

    for key, value in config_params.items():
        config["params"][key] = value

    return config
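
# The snippet below is a hypothetical, self-contained stand-in for
# uc.parse_and_convert as it is used above; it is assumed to turn a
# human-readable size such as "500 MB" into the requested unit (bytes here).
# The real project helper may behave differently; this only sketches the kind
# of conversion the benchmark relies on.
_UNIT_BYTES = {"B": 1, "KB": 10 ** 3, "MB": 10 ** 6, "GB": 10 ** 9,
               "TB": 10 ** 12}

def _approx_parse_size_to_bytes(size_string):
    """Illustrative parser: "500 MB" -> 500000000.0; bare numbers are bytes."""
    parts = str(size_string).split()
    if len(parts) == 1:
        return float(parts[0])
    value, unit = parts
    return float(value) * _UNIT_BYTES[unit.upper()]
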
def disk_benchmark(input_directory, output_directory, benchmark_size_per_disk,
                   **kwargs):

    if output_directory is None:
        output_directory = utils.sibling_directory(input_directory,
                                                   "disk_speeds")

    (input_url, output_url) = utils.generate_urls(input_directory,
                                                  output_directory, None)

    config = utils.mapreduce_job(input_dir=input_url,
                                 output_dir=output_url,
                                 map_function="DiskBenchmarkMapFunction",
                                 reduce_function="DiskBenchmarkReduceFunction")

    utils.force_single_partition(config)

    data_size_bytes = int(uc.parse_and_convert(benchmark_size_per_disk, "B"))

    config_params = {"DISK_BENCHMARK_DATA_SIZE": data_size_bytes}

    if "params" not in config:
        config["params"] = {}

    for key, value in config_params.items():
        config["params"][key] = value

    return config
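
# Hypothetical usage sketch, not taken from the original source: it assumes a
# surrounding job runner consumes the returned config dict, and that
# benchmark_size_per_disk is a size string uc.parse_and_convert understands.
def _example_disk_benchmark_usage():
    # Leaving output_directory as None writes results to a "disk_speeds"
    # sibling of the input directory.
    config = disk_benchmark(
        "/data/benchmark_input",  # hypothetical input directory
        None,                     # default sibling output directory
        "1 GB")                   # data benchmarked on each disk
    return config
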
def merge_files(input_directory, output_directory, **kwargs):
    """Build a job config that merges the files in input_directory into a
    single output partition."""
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_merged")

    (input_url, output_url) = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(input_dir=input_url, output_dir=output_url)

    utils.force_single_partition(config)

    return config
def merge_files(input_directory, output_directory, hdfs, **kwargs):
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_merged")

    (input_url, output_url) = utils.generate_urls(
        input_directory, output_directory, hdfs)

    config = utils.mapreduce_job(
        input_dir=input_url, output_dir=output_url)

    utils.force_single_partition(config)

    return config
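
# Hypothetical usage sketch, not taken from the original source. The hdfs
# argument is forwarded to utils.generate_urls, which is assumed to build
# HDFS-style URLs when it is truthy.
def _example_merge_files_usage():
    # Merge all files under the run output into a single partition written to
    # a "<dirname>_merged" sibling directory.
    return merge_files("/data/run_output", None, hdfs=False)
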
def sum_values(input_directory, output_directory, **kwargs):
    """Build a job config that maps every record to a single key and sums the
    values into one output partition."""
    if output_directory is None:
        output_directory = utils.sibling_directory(input_directory,
                                                   "%(dirname)s_sumcounts")

    (input_url, output_url) = utils.generate_urls(input_directory,
                                                  output_directory)

    config = utils.mapreduce_job(input_dir=input_url,
                                 output_dir=output_url,
                                 map_function="ZeroKeyMapFunction",
                                 reduce_function="SumValuesReduceFunction")

    utils.force_single_partition(config)

    return config
def sum_values(input_directory, output_directory, **kwargs):
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_sumcounts")

    (input_url, output_url) = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="ZeroKeyMapFunction",
        reduce_function="SumValuesReduceFunction")

    utils.force_single_partition(config)

    return config
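
# Hypothetical usage sketch, not taken from the original source. Because
# ZeroKeyMapFunction and SumValuesReduceFunction funnel every record into a
# single key, the resulting job is assumed to emit one global sum.
def _example_sum_values_usage():
    # Sums the values of every record under the input directory; results land
    # in a "<dirname>_sumcounts" sibling directory.
    return sum_values("/data/word_counts", None)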