def disk_benchmark(input_directory, output_directory, benchmark_size_per_disk,
                   **kwargs):
    """Build the job config for the disk speed benchmark.

    When output_directory is None, results go to a "disk_speeds" sibling of
    the input directory. benchmark_size_per_disk is a human-readable size
    string that is converted to a byte count for the job parameters.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "disk_speeds")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory, None)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="DiskBenchmarkMapFunction",
        reduce_function="DiskBenchmarkReduceFunction")

    # All benchmark output is funneled into one partition.
    utils.force_single_partition(config)

    # Convert the size string (e.g. "500MB") into a raw byte count.
    data_size_bytes = int(uc.parse_and_convert(benchmark_size_per_disk, "B"))

    config.setdefault("params", {}).update(
        {"DISK_BENCHMARK_DATA_SIZE": data_size_bytes})

    return config
def pagerank(input_directory, output_directory, iterations, **kwargs):
    """Build a chain of PageRank jobs, one per iteration.

    Each iteration reads the previous iteration's output; intermediate
    results land in "..._pagerank_iteration_N" sibling directories and the
    final iteration writes to output_directory.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_with_pageranks")

    configs = []
    current_input = input_directory

    for i in xrange(iterations):
        if i == iterations - 1:
            # Last iteration writes the final result.
            current_output = output_directory
        else:
            current_output = utils.sibling_directory(
                input_directory,
                "%(dirname)s_pagerank_iteration_" + str(i + 1))

        input_url, output_url = utils.generate_urls(
            current_input, current_output)

        configs.append(utils.mapreduce_job(
            input_dir=input_url,
            output_dir=output_url,
            map_function="PageRankMapFunction",
            reduce_function="PageRankReduceFunction"))

        # The next iteration consumes what this one produced.
        current_input = current_output

    return utils.run_in_sequence(*configs)
def disk_benchmark(
        input_directory, output_directory, benchmark_size_per_disk, **kwargs):
    """Build the job config for the disk speed benchmark.

    Results default to a "disk_speeds" sibling of the input directory.
    benchmark_size_per_disk is parsed into a byte count and passed to the
    job via DISK_BENCHMARK_DATA_SIZE.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "disk_speeds")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory, None)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="DiskBenchmarkMapFunction",
        reduce_function="DiskBenchmarkReduceFunction")

    # Benchmark output goes to a single partition.
    utils.force_single_partition(config)

    # Human-readable size string -> raw byte count.
    data_size_bytes = int(uc.parse_and_convert(benchmark_size_per_disk, "B"))

    config.setdefault("params", {}).update(
        {"DISK_BENCHMARK_DATA_SIZE": data_size_bytes})

    return config
def pagerank(input_directory, output_directory, iterations, hdfs, **kwargs):
    """Build a chain of PageRank jobs, one per iteration.

    Intermediate iterations write to "..._pagerank_iteration_N" sibling
    directories; the final iteration writes to output_directory. The hdfs
    flag is forwarded to URL generation.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_with_pageranks")

    configs = []
    current_input = input_directory

    for i in xrange(iterations):
        if i == iterations - 1:
            # Final iteration writes the user-visible output.
            current_output = output_directory
        else:
            current_output = utils.sibling_directory(
                input_directory,
                "%(dirname)s_pagerank_iteration_" + str(i + 1))

        input_url, output_url = utils.generate_urls(
            current_input, current_output, hdfs)

        configs.append(utils.mapreduce_job(
            input_dir=input_url,
            output_dir=output_url,
            map_function="PageRankMapFunction",
            reduce_function="PageRankReduceFunction"))

        # Feed this iteration's output into the next one.
        current_input = current_output

    return utils.run_in_sequence(*configs)
def merge_files(input_directory, output_directory, **kwargs):
    """Build a job that merges all input files into one output partition.

    Defaults output_directory to a "..._merged" sibling of the input.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_merged")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(input_dir=input_url, output_dir=output_url)

    # A single partition means a single merged output.
    utils.force_single_partition(config)
    return config
def cloudburst(
        input_directory, output_directory, hdfs, min_read_len, max_read_len,
        max_align_diff, redundancy, allow_differences, block_size, **kwargs):
    """Build the CloudBurst sequence-alignment job followed by a merge job.

    The alignment job writes to a "..._cloudburst_unmerged" sibling of the
    input directory; a second job then merges that output into
    output_directory (defaulting to a "..._cloudburst_aligned" sibling).
    Seed and flank lengths are derived from the read-length and
    alignment-difference parameters and passed via job params.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_cloudburst_aligned")

    intermediate_directory = utils.sibling_directory(
        input_directory, "%(dirname)s_cloudburst_unmerged")

    (input_url, intermediate_url) = utils.generate_urls(
        input_directory, intermediate_directory, hdfs)

    cloudburst_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=intermediate_url,
        map_function="CloudBurstMapFunction",
        reduce_function="CloudBurstReduceFunction",
        partition_function="CloudBurstPartitionFunction")

    # Use explicit floor division: the original "/" floor-divides only under
    # Python 2 int semantics and would silently yield a float seed length on
    # Python 3. Lengths are counts, so flooring is the intended behavior
    # (assumes min_read_len/max_align_diff are ints — consistent with their
    # use as length parameters).
    seed_len = min_read_len // (max_align_diff + 1)
    flank_len = max_read_len - seed_len + max_align_diff

    cloudburst_params = {
        "CLOUDBURST_MIN_READ_LEN": min_read_len,
        "CLOUDBURST_MAX_READ_LEN": max_read_len,
        "CLOUDBURST_MAX_ALIGN_DIFF": max_align_diff,
        "CLOUDBURST_SEED_LEN": seed_len,
        "CLOUDBURST_FLANK_LEN": flank_len,
        "CLOUDBURST_REDUNDANCY": redundancy,
        "CLOUDBURST_ALLOW_DIFFERENCES": int(allow_differences),
        "CLOUDBURST_BLOCK_SIZE": block_size,
        }

    cloudburst_config.setdefault("params", {}).update(cloudburst_params)

    # Second job merges the unmerged alignment output into the final
    # output directory.
    mergefiles_config = merge_files(
        intermediate_directory, output_directory, hdfs)

    return utils.run_in_sequence(cloudburst_config, mergefiles_config)
def merge_files(input_directory, output_directory, hdfs, **kwargs):
    """Build a job that merges all input files into one output partition.

    Defaults output_directory to a "..._merged" sibling of the input; the
    hdfs flag is forwarded to URL generation.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_merged")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory, hdfs)

    config = utils.mapreduce_job(input_dir=input_url, output_dir=output_url)

    # One partition -> one merged output.
    utils.force_single_partition(config)
    return config
def sum_values(input_directory, output_directory, **kwargs):
    """Build a job that sums all record values into a single output.

    Maps every record under one key (ZeroKeyMapFunction) and reduces with
    SumValuesReduceFunction into a single forced partition.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_sumcounts")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="ZeroKeyMapFunction",
        reduce_function="SumValuesReduceFunction")

    utils.force_single_partition(config)
    return config
def sum_values(input_directory, output_directory, **kwargs):
    """Build a job that sums all record values into a single output.

    All records are mapped to one key and summed by the reducer; the
    result is written to a single forced partition.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_sumcounts")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="ZeroKeyMapFunction",
        reduce_function="SumValuesReduceFunction")

    utils.force_single_partition(config)
    return config
def gen_text_wex(input_directory, output_directory, **kwargs):
    """Build a job that extracts text from a WEX dump.

    Reads input line-by-line (TextLineFormatReader) and passes extracted
    text through an identity reduce.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_wex_text")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="WEXTextExtractorMapFunction",
        reduce_function="IdentityReduceFunction")

    # WEX dumps are consumed as plain text lines.
    params = config.setdefault("params", {})
    params["MAP_INPUT_FORMAT_READER"] = "TextLineFormatReader"

    return config
def gen_text_wex(input_directory, output_directory, **kwargs):
    """Build a job that extracts text from a WEX dump.

    Input is read as plain text lines; the reduce step is an identity
    pass-through.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_wex_text")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="WEXTextExtractorMapFunction",
        reduce_function="IdentityReduceFunction")

    config.setdefault("params", {})[
        "MAP_INPUT_FORMAT_READER"] = "TextLineFormatReader"

    return config
def rdrand(username, hdfs, input_directory, output_directory, **kwargs):
    """Build the RdRand duplicate-key detection job plus a merge job.

    The first job counts duplicate keys from RdRand-formatted input; the
    second merges its output into a single readable file under
    "final_output".
    """
    if output_directory is None:
        output_directory = "%s/outputs" % (username)

    rdrand_output_directory = "%s/rdrand" % (output_directory)
    merged_output_directory = "%s/final_output" % (output_directory)

    input_url, rdrand_output_url = utils.generate_urls(
        input_directory, rdrand_output_directory, hdfs)
    merged_output_url = utils.generate_url(merged_output_directory, hdfs)

    rdrand_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=rdrand_output_url,
        map_function="PassThroughMapFunction",
        reduce_function="CountDuplicateKeysReduceFunction",
        partition_function="UniformPartitionFunction")

    rdrand_params = {
        "SKIP_PHASE_ZERO": 1,  # Don't sample...
        "INTERMEDIATE_TO_INPUT_RATIO": 3.0,  # ... instead assume ratio of 3
        "MAP_INPUT_FORMAT_READER": "RdRandFormatReader",  # 64-bit fragments
        "REDUCE_INPUT_FORMAT_READER":
            "FixedSizeKVPairFormatReader",  # no header
        "REDUCE_INPUT_FIXED_KEY_LENGTH": 16,  # 128-bit intermediate keys...
        "REDUCE_INPUT_FIXED_VALUE_LENGTH": 0,  # ... with empty values
        "WRITE_WITHOUT_HEADERS.phase_one": 1  # no headers
        }

    rdrand_config.setdefault("params", {}).update(rdrand_params)

    # Run a second job to merge all duplicate key information into a single
    # output file for better readability.
    mergefiles_config = merge_files(
        rdrand_output_directory, merged_output_directory, hdfs)

    return utils.run_in_sequence(rdrand_config, mergefiles_config)
def ngram_count(input_directory, output_directory, ngram_count, **kwargs):
    """Build a job that counts n-grams of the given size.

    Defaults the output to a "..._Ngram_counts" sibling directory; the
    n-gram size is forwarded to the job via NGRAM_COUNT.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory,
            "%(dirname)s_" + ("%dgram_counts" % (ngram_count)))

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="NGramMapFunction",
        reduce_function="WordCountReduceFunction")

    config.setdefault("params", {})["NGRAM_COUNT"] = ngram_count

    return config
def gen_network_wex(input_directory, output_directory, hdfs, **kwargs):
    """Build a job that extracts a link graph from a WEX dump.

    Links are extracted per line of text and reduced into an
    adjacency-list form suitable for PageRank input.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_wex_graph")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory, hdfs)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="WEXLinkExtractorMapFunction",
        reduce_function="WEXAdjacencyToPageRankReducer")

    # WEX dumps are consumed as plain text lines.
    config.setdefault("params", {})[
        "MAP_INPUT_FORMAT_READER"] = "TextLineFormatReader"

    return config
def wordcount(username, hdfs, input_directory, output_directory, use_combiner,
              **kwargs):
    """Build a word-count job config.

    use_combiner selects the combining map function variant; output
    defaults to the user's "outputs/wordcount" directory.
    """
    if output_directory is None:
        output_directory = "%s/outputs/wordcount" % (username)

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory, hdfs)

    map_function = (
        "CombiningWordCountMapFunction" if use_combiner
        else "WordCountMapFunction")

    return utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function=map_function,
        reduce_function="WordCountReduceFunction")
def ngram_count(input_directory, output_directory, ngram_count, **kwargs):
    """Build a job that counts n-grams of the given size.

    Output defaults to a "..._Ngram_counts" sibling directory; the n-gram
    size is passed to the job as NGRAM_COUNT.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory,
            "%(dirname)s_" + ("%dgram_counts" % (ngram_count)))

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="NGramMapFunction",
        reduce_function="WordCountReduceFunction")

    params = config.setdefault("params", {})
    params["NGRAM_COUNT"] = ngram_count

    return config
def gen_network_wex(input_directory, output_directory, **kwargs):
    """Build a job that extracts a link graph from a WEX dump.

    The map step extracts links from text lines and the reduce step emits
    adjacency data shaped for PageRank input.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "%(dirname)s_wex_graph")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="WEXLinkExtractorMapFunction",
        reduce_function="WEXAdjacencyToPageRankReducer")

    params = config.setdefault("params", {})
    params["MAP_INPUT_FORMAT_READER"] = "TextLineFormatReader"

    return config
def wordcount(
        username, input_directory, output_directory, use_combiner, **kwargs):
    """Build a word-count job config.

    use_combiner switches to the combining map function; output defaults
    to the user's "outputs/wordcount" directory.
    """
    if output_directory is None:
        output_directory = "%s/outputs/wordcount" % (username)

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    if use_combiner:
        map_function = "CombiningWordCountMapFunction"
    else:
        map_function = "WordCountMapFunction"

    return utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function=map_function,
        reduce_function="WordCountReduceFunction")
def rdrand(username, input_directory, output_directory, **kwargs):
    """Build the RdRand duplicate-key detection job plus a merge job.

    The first job counts duplicate keys in RdRand-formatted input; a
    second job merges that output into one file under "final_output".
    """
    if output_directory is None:
        output_directory = "%s/outputs" % (username)

    rdrand_output_directory = "%s/rdrand" % (output_directory)
    merged_output_directory = "%s/final_output" % (output_directory)

    input_url, rdrand_output_url = utils.generate_urls(
        input_directory, rdrand_output_directory)
    merged_output_url = utils.generate_url(merged_output_directory)

    rdrand_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=rdrand_output_url,
        map_function="PassThroughMapFunction",
        reduce_function="CountDuplicateKeysReduceFunction",
        partition_function="UniformPartitionFunction")

    rdrand_params = {
        "SKIP_PHASE_ZERO": 1,  # Don't sample...
        "INTERMEDIATE_TO_INPUT_RATIO": 3.0,  # ... instead assume ratio of 3
        "MAP_INPUT_FORMAT_READER": "RdRandFormatReader",  # 64-bit fragments
        "REDUCE_INPUT_FORMAT_READER":
            "FixedSizeKVPairFormatReader",  # no header
        "REDUCE_INPUT_FIXED_KEY_LENGTH": 16,  # 128-bit intermediate keys...
        "REDUCE_INPUT_FIXED_VALUE_LENGTH": 0,  # ... with empty values
        "WRITE_WITHOUT_HEADERS.phase_one": 1  # no headers
        }

    rdrand_config.setdefault("params", {}).update(rdrand_params)

    # Run a second job to merge all duplicate key information into a single
    # output file for better readability.
    mergefiles_config = merge_files(
        rdrand_output_directory, merged_output_directory)

    return utils.run_in_sequence(rdrand_config, mergefiles_config)
def click_logs(
        input_directory, output_directory, session_time_threshold, **kwargs):
    """Build a job that summarizes click-log sessions.

    Records pass straight through the map step; the reducer groups them
    into sessions using the given inactivity threshold. Secondary keys are
    enabled so the reducer sees records in order.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "session_logs_%(dirname)s")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="PassThroughMapFunction",
        reduce_function="ClickLogSessionSummarizerReduceFunction")

    params = config.setdefault("params", {})
    params["USE_SECONDARY_KEYS"] = 1
    params["CLICK_LOG_SUMMARIZER_SESSION_TIME_THRESHOLD"] = (
        session_time_threshold)

    return config
def tuple_length_count(input_directory, output_directory, **kwargs):
    """Build a two-job pipeline that counts tuples by length.

    The first job writes per-partition counts to an intermediate
    "unmerged_counts_..." sibling directory; the second job merges those
    into output_directory.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "tuple_length_counter_%(dirname)s")

    intermediate_directory = utils.sibling_directory(
        input_directory, "unmerged_counts_%(dirname)s")

    # Note: the counting job's output URL points at the intermediate
    # directory, not the final output.
    input_url, intermediate_url = utils.generate_urls(
        input_directory, intermediate_directory)

    count_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=intermediate_url,
        map_function="TupleLengthCounterMapFunction",
        reduce_function="SumValuesReduceFunction")

    merge_config = merge_files.merge_files(
        intermediate_directory, output_directory)

    return utils.run_in_sequence(count_config, merge_config)
def click_logs(
        input_directory, output_directory, hdfs, session_time_threshold,
        **kwargs):
    """Build a job that summarizes click-log sessions.

    Records pass through the map step unchanged; the reducer groups them
    into sessions using the inactivity threshold. Secondary keys are
    enabled so records arrive at the reducer in order.
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "session_logs_%(dirname)s")

    input_url, output_url = utils.generate_urls(
        input_directory, output_directory, hdfs)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="PassThroughMapFunction",
        reduce_function="ClickLogSessionSummarizerReduceFunction")

    params = config.setdefault("params", {})
    params["USE_SECONDARY_KEYS"] = 1
    params["CLICK_LOG_SUMMARIZER_SESSION_TIME_THRESHOLD"] = (
        session_time_threshold)

    return config