def test_merge_files(self):
    # Read both input files up front, then rewind so they can be read again.
    file_read1 = open("FMI.txt", "r")
    content1 = file_read1.read()
    file_read1.seek(0)

    file_read2 = open("Hack Bulgaria.txt", "r")
    content2 = file_read2.read()
    file_read2.seek(0)

    # Write the expected merged content ourselves...
    file_to_write = "merge.txt"
    file_write = open(file_to_write, "w")
    file_write.write(content1 + content2)

    # ...and check that merge_files produces the same concatenation.
    self.assertEqual(file_read1.read() + file_read2.read(),
                     merge_files("FMI.txt", "Hack Bulgaria.txt", file_to_write))

    file_read1.close()
    file_read2.close()
    file_write.close()
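
The test above assumes a merge_files(path1, path2, out_path) helper that writes the
concatenation of the two inputs to out_path and also returns it. A minimal sketch of
such a helper (an assumption for illustration, not the implementation under test):

def merge_files(first_path, second_path, out_path):
    # Concatenate the two input files, write the result, and return it.
    with open(first_path, "r") as first, open(second_path, "r") as second:
        merged = first.read() + second.read()
    with open(out_path, "w") as out:
        out.write(merged)
    return merged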
def cloudburst(
    input_directory,
    output_directory,
    hdfs,
    min_read_len,
    max_read_len,
    max_align_diff,
    redundancy,
    allow_differences,
    block_size,
    **kwargs
):

    if output_directory is None:
        output_directory = utils.sibling_directory(input_directory, "%(dirname)s_cloudburst_aligned")

    intermediate_directory = utils.sibling_directory(input_directory, "%(dirname)s_cloudburst_unmerged")

    (input_url, intermediate_url) = utils.generate_urls(input_directory, intermediate_directory, hdfs)

    cloudburst_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=intermediate_url,
        map_function="CloudBurstMapFunction",
        reduce_function="CloudBurstReduceFunction",
        partition_function="CloudBurstPartitionFunction",
    )

    # Pigeonhole argument: split each read into (max_align_diff + 1) seeds, so a
    # read with at most max_align_diff differences contains at least one exact
    # seed (e.g. min_read_len=36, max_align_diff=3 gives seed_len=9).  Integer
    # division keeps the seed length a whole number of bases under Python 3.
    seed_len = min_read_len // (max_align_diff + 1)
    flank_len = max_read_len - seed_len + max_align_diff

    cloudburst_params = {
        "CLOUDBURST_MIN_READ_LEN": min_read_len,
        "CLOUDBURST_MAX_READ_LEN": max_read_len,
        "CLOUDBURST_MAX_ALIGN_DIFF": max_align_diff,
        "CLOUDBURST_SEED_LEN": seed_len,
        "CLOUDBURST_FLANK_LEN": flank_len,
        "CLOUDBURST_REDUNDANCY": redundancy,
        "CLOUDBURST_ALLOW_DIFFERENCES": int(allow_differences),
        "CLOUDBURST_BLOCK_SIZE": block_size,
    }

    if "params" not in cloudburst_config:
        cloudburst_config["params"] = {}

    for key, value in cloudburst_params.items():
        cloudburst_config["params"][key] = value

    mergefiles_config = merge_files(intermediate_directory, output_directory, hdfs)

    return utils.run_in_sequence(cloudburst_config, mergefiles_config)
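
A hypothetical invocation (the directory path and read parameters below are
illustrative, not taken from the original script):

# Illustrative only: the path and parameter values below are made up.
config = cloudburst(
    input_directory="/data/reads",
    output_directory=None,   # falls back to the "_cloudburst_aligned" sibling directory
    hdfs=False,
    min_read_len=36,
    max_read_len=36,
    max_align_diff=3,
    redundancy=1,
    allow_differences=True,
    block_size=128)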
Example #3
def rdrand(username, hdfs, input_directory, output_directory, **kwargs):

    if output_directory is None:
        output_directory = "%s/outputs" % (username)
    rdrand_output_directory = "%s/rdrand" % (output_directory)
    merged_output_directory = "%s/final_output" % (output_directory)

    (input_url,
     rdrand_output_url) = utils.generate_urls(input_directory,
                                              rdrand_output_directory, hdfs)
    merged_output_url = utils.generate_url(merged_output_directory, hdfs)

    rdrand_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=rdrand_output_url,
        map_function="PassThroughMapFunction",
        reduce_function="CountDuplicateKeysReduceFunction",
        partition_function="UniformPartitionFunction")

    rdrand_params = {
        "SKIP_PHASE_ZERO": 1,  # Don't sample...
        "INTERMEDIATE_TO_INPUT_RATIO": 3.0,  #... instead assume ratio of 3
        "MAP_INPUT_FORMAT_READER": "RdRandFormatReader",  # 64-bit fragments
        "REDUCE_INPUT_FORMAT_READER":
        "FixedSizeKVPairFormatReader",  # no header
        "REDUCE_INPUT_FIXED_KEY_LENGTH": 16,  # 128-bit intermediate keys...
        "REDUCE_INPUT_FIXED_VALUE_LENGTH": 0,  # ... with empty values
        "WRITE_WITHOUT_HEADERS.phase_one": 1  # no headers
    }

    if "params" not in rdrand_config:
        rdrand_config["params"] = {}

    for key, value in rdrand_params.items():
        rdrand_config["params"][key] = value

    # Run a second job to merge all duplicate key information into a single
    # output file for better readability.
    mergefiles_config = merge_files(rdrand_output_directory,
                                    merged_output_directory, hdfs)

    return utils.run_in_sequence(rdrand_config, mergefiles_config)
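
The reduce-side input format configured above is simple: each record is a 16-byte
key with an empty value and no header. A standalone sketch of that layout (not the
actual FixedSizeKVPairFormatReader, whose implementation is not shown here):

def count_fixed_size_records(path, key_len=16, value_len=0):
    # With fixed-size keys, empty values, and no headers, the record count is
    # just the file size divided by the record size.
    record_len = key_len + value_len
    with open(path, "rb") as f:
        data = f.read()
    return len(data) // record_len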
Example #4
def rdrand(username, input_directory, output_directory, **kwargs):

    if output_directory is None:
        output_directory = "%s/outputs" % (username)
    rdrand_output_directory = "%s/rdrand" % (output_directory)
    merged_output_directory = "%s/final_output" % (output_directory)

    (input_url, rdrand_output_url) = utils.generate_urls(
        input_directory, rdrand_output_directory)
    merged_output_url = utils.generate_url(merged_output_directory)

    rdrand_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=rdrand_output_url,
        map_function="PassThroughMapFunction",
        reduce_function="CountDuplicateKeysReduceFunction",
        partition_function="UniformPartitionFunction")

    rdrand_params = {
        "SKIP_PHASE_ZERO": 1,  # Don't sample...
        "INTERMEDIATE_TO_INPUT_RATIO": 3.0,  # ... instead assume ratio of 3
        "MAP_INPUT_FORMAT_READER": "RdRandFormatReader",  # 64-bit fragments
        "REDUCE_INPUT_FORMAT_READER": "FixedSizeKVPairFormatReader",  # no header
        "REDUCE_INPUT_FIXED_KEY_LENGTH": 16,  # 128-bit intermediate keys...
        "REDUCE_INPUT_FIXED_VALUE_LENGTH": 0,  # ... with empty values
        "WRITE_WITHOUT_HEADERS.phase_one": 1  # no headers
    }

    if "params" not in rdrand_config:
        rdrand_config["params"] = {}

    for key, value in rdrand_params.items():
        rdrand_config["params"][key] = value

    # Run a second job to merge all duplicate key information into a single
    # output file for better readability.
    mergefiles_config = merge_files(
        rdrand_output_directory, merged_output_directory)

    return utils.run_in_sequence(rdrand_config, mergefiles_config)
Example #5
def tuple_length_count(input_directory, output_directory, **kwargs):

    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "tuple_length_counter_%(dirname)s")

    intermediate_directory = utils.sibling_directory(
        input_directory, "unmerged_counts_%(dirname)s")

    (input_url, output_url) = utils.generate_urls(
        input_directory, intermediate_directory)

    tuple_length_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="TupleLengthCounterMapFunction",
        reduce_function="SumValuesReduceFunction")

    merge_files_config = merge_files.merge_files(
        intermediate_directory, output_directory)

    config = utils.run_in_sequence(tuple_length_config, merge_files_config)

    return config
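
Conceptually, the map/reduce pair above builds a histogram of tuple lengths. A
plain-Python sketch of that idea (the real map and reduce functions are only named
by string here, so what exactly counts as a tuple's length is an assumption):

from collections import Counter

def tuple_length_histogram(records):
    # Map: emit each record's length; Reduce: sum the counts per length.
    counts = Counter()
    for record in records:
        counts[len(record)] += 1
    return counts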
Example #6
path_name = args.in_path
loss_weights = [args.weight_class, args.weight_regress]

# if True it retrains, else it reads a model from disk
retrain = True  # train the NN (~20 min); otherwise a saved model is needed
# name of the model for saving or loading
model_files = 'selu_mode'

# make the output directory
out_path = os.getcwd() + out_path
os.mkdir(out_path)

###########################################
# get the data !!! ########################

(X_train, is_MC_train, ref_train, bmass_train, Y_train) = merge_files(
    path_name, 'train', n_vertex=n_vertex)
(X_test, is_MC, ref_test, bmass_test, Y_test) = merge_files(
    path_name, 'test', n_vertex=n_vertex)

# thou shalt preprocess (var = 1, mean = 0) for gradient-based learning!
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

############################################


# the weights below make sure that MC is not used for the second loss,
# which is the mass regression
background = 1 - is_MC_train
all_events = np.ones(background.shape)
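
A hedged sketch of how these per-output sample weights could be fed to a two-output
Keras model (the architecture, output order, and compile settings below are
assumptions for illustration; this excerpt does not show the actual model):

from tensorflow import keras

# Assumed two-output model: a classification head and a mass-regression head
# sharing one hidden layer.  The architecture is illustrative only.
inputs = keras.Input(shape=(X_scaled_train.shape[1],))
hidden = keras.layers.Dense(64, activation="selu")(inputs)
class_out = keras.layers.Dense(1, activation="sigmoid", name="classification")(hidden)
mass_out = keras.layers.Dense(1, name="mass_regression")(hidden)
model = keras.Model(inputs, [class_out, mass_out])

model.compile(
    optimizer="adam",
    loss=["binary_crossentropy", "mse"],
    loss_weights=loss_weights)  # [args.weight_class, args.weight_regress]

# all_events weights every event in the classification loss, while background
# zeroes out MC events in the mass-regression loss, as described above.
model.fit(
    X_scaled_train,
    [Y_train, bmass_train],
    sample_weight=[all_events, background],
    epochs=5,
    batch_size=256)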