Example #1
0
def local_sort(key_arrs, data):
    """Sort key_arrs in place, permuting data rows alongside the keys.

    StringArray elements cannot be swapped directly, so keys and data are
    converted to list(string) first, sorted via the timsort state object,
    then copied back into the original arrays.
    """
    key_lists = to_string_list(key_arrs)
    data_lists = to_string_list(data)
    n = len(key_arrs[0])
    state = SortState(key_lists, n, data_lists)
    hpat.timsort.sort(state, key_lists, 0, n, data_lists)
    # write the sorted lists back into the (possibly string) arrays
    cp_str_list_to_array(key_arrs, key_lists)
    cp_str_list_to_array(data, data_lists)
Example #2
0
def local_sort(key_arrs, data, ascending=True):
    """Sort key_arrs in place (ascending or descending), permuting data alongside.

    StringArray elements cannot be swapped directly, so keys and data are
    converted to list(string) first, sorted ascending, reversed if a
    descending order was requested, then copied back.
    """
    key_lists = to_string_list(key_arrs)
    data_lists = to_string_list(data)
    n = len(key_arrs[0])
    hpat.timsort.sort(key_lists, 0, n, data_lists)
    if not ascending:
        # descending order = ascending sort followed by a full reverse
        hpat.timsort.reverseRange(key_lists, 0, n, data_lists)
    cp_str_list_to_array(key_arrs, key_lists)
    cp_str_list_to_array(data, data_lists)
Example #3
0
def parallel_sort(key_arrs, data, ascending=True):
    """Distributed sample sort of (key_arrs, data) across MPI ranks.

    Each rank samples its local keys; the samples are gathered on the root,
    sorted, and used to pick n_pes-1 splitter bounds, which are broadcast.
    Rows are then shuffled to their destination rank via alltoallv so rank i
    ends up holding the i-th key range. Returns the (keys, data) tuples
    received locally.

    NOTE(review): node_id only advances by at most one per element in the
    partitioning loop, which presumably assumes the local data is already
    sorted (e.g. by local_sort) -- TODO confirm against callers.
    """
    n_local = len(key_arrs[0])
    # global row count, needed to size the sample fraction
    n_total = hpat.distributed_api.dist_reduce(n_local,
                                               np.int32(Reduce_Type.Sum.value))

    n_pes = hpat.distributed_api.get_size()
    my_rank = hpat.distributed_api.get_rank()

    # similar to Spark's sample computation Partitioner.scala
    # NOTE(review): min() caps the sample size at MIN_SAMPLES; the constant's
    # name suggests a floor (max) -- verify the intended semantics.
    sampleSize = min(samplePointsPerPartitionHint * n_pes, MIN_SAMPLES)

    fraction = min(sampleSize / max(n_total, 1), 1.0)
    n_loc_samples = min(math.ceil(fraction * n_local), n_local)
    # sample with replacement from the local keys
    inds = np.random.randint(0, n_local, n_loc_samples)
    samples = key_arrs[0][inds]
    # print(sampleSize, fraction, n_local, n_loc_samples, len(samples))

    all_samples = hpat.distributed_api.gatherv(samples)
    all_samples = to_string_list(all_samples)
    bounds = empty_like_type(n_pes - 1, all_samples)

    # only the root computes the splitter bounds; other ranks receive them
    # via the broadcast below
    if my_rank == MPI_ROOT:
        all_samples.sort()
        if not ascending:
            # reverse so bounds are picked in descending order too
            all_samples = all_samples[::-1]
        n_samples = len(all_samples)
        step = math.ceil(n_samples / n_pes)
        for i in range(n_pes - 1):
            bounds[i] = all_samples[min((i + 1) * step, n_samples - 1)]
        # print(bounds)

    bounds = str_list_to_array(bounds)
    # non-root ranks need a preallocated buffer to receive the string bcast
    bounds = hpat.distributed_api.prealloc_str_for_bcast(bounds)
    hpat.distributed_api.bcast(bounds)

    # calc send/recv counts
    pre_shuffle_meta = alloc_pre_shuffle_metadata(key_arrs, data, n_pes, True)
    node_id = 0
    for i in range(n_local):
        val = key_arrs[0][i]
        # TODO: refactor
        # advance to the next destination rank once val crosses the current
        # bound (direction depends on sort order)
        if node_id < (n_pes - 1) and (ascending and val >= bounds[node_id] or
                                      (not ascending)
                                      and val <= bounds[node_id]):
            node_id += 1
        update_shuffle_meta(pre_shuffle_meta, node_id, i, (val, ),
                            getitem_arr_tup(data, i), True)

    shuffle_meta = finalize_shuffle_meta(key_arrs, data, pre_shuffle_meta,
                                         n_pes, True)

    # shuffle
    recvs = alltoallv_tup(key_arrs + data, shuffle_meta)
    # split the received buffers back into key arrays and data arrays
    out_key = _get_keys_tup(recvs, key_arrs)
    out_data = _get_data_tup(recvs, key_arrs)

    return out_key, out_data
Example #4
0
def parallel_sort(key_arr, data):
    """Distributed ascending sample sort of (key_arr, data) across MPI ranks.

    Single-key, ascending-only variant: samples local keys, gathers the
    samples on the root to compute n_pes-1 splitter bounds, broadcasts
    them, and shuffles each row to the rank owning its key range via
    alltoallv. Returns (sorted-by-range keys, data) received locally.

    NOTE(review): node_id only advances by at most one per element in the
    partitioning loop, which presumably assumes the local data is already
    sorted (e.g. by local_sort) -- TODO confirm against callers.
    """
    n_local = len(key_arr)
    # global row count, needed to size the sample fraction
    n_total = hpat.distributed_api.dist_reduce(n_local,
                                               np.int32(Reduce_Type.Sum.value))

    n_pes = hpat.distributed_api.get_size()
    my_rank = hpat.distributed_api.get_rank()

    # similar to Spark's sample computation Partitioner.scala
    # NOTE(review): min() caps the sample size at MIN_SAMPLES; the constant's
    # name suggests a floor (max) -- verify the intended semantics.
    sampleSize = min(samplePointsPerPartitionHint * n_pes, MIN_SAMPLES)

    fraction = min(sampleSize / max(n_total, 1), 1.0)
    n_loc_samples = min(math.ceil(fraction * n_local), n_local)
    # sample with replacement from the local keys
    inds = np.random.randint(0, n_local, n_loc_samples)
    samples = key_arr[inds]
    # print(sampleSize, fraction, n_local, n_loc_samples, len(samples))

    all_samples = hpat.distributed_api.gatherv(samples)
    all_samples = to_string_list(all_samples)
    bounds = empty_like_type(n_pes - 1, all_samples)

    # only the root computes the splitter bounds; other ranks receive them
    # via the broadcast below
    if my_rank == MPI_ROOT:
        all_samples.sort()
        n_samples = len(all_samples)
        step = math.ceil(n_samples / n_pes)
        for i in range(n_pes - 1):
            bounds[i] = all_samples[min((i + 1) * step, n_samples - 1)]
        # print(bounds)

    bounds = str_list_to_array(bounds)
    # non-root ranks need a preallocated buffer to receive the string bcast
    bounds = hpat.distributed_api.prealloc_str_for_bcast(bounds)
    hpat.distributed_api.bcast(bounds)

    # calc send/recv counts
    shuffle_meta = alloc_shuffle_metadata(key_arr, n_pes, True)
    data_shuffle_meta = data_alloc_shuffle_metadata(data, n_pes, True)
    node_id = 0
    for i in range(n_local):
        val = key_arr[i]
        # advance to the next destination rank once val crosses the bound
        if node_id < (n_pes - 1) and val >= bounds[node_id]:
            node_id += 1
        update_shuffle_meta(shuffle_meta, node_id, i, val)
        update_data_shuffle_meta(data_shuffle_meta, node_id, i, data)

    finalize_shuffle_meta(key_arr, shuffle_meta, True)
    finalize_data_shuffle_meta(data, data_shuffle_meta, shuffle_meta, True)

    # shuffle
    alltoallv(key_arr, shuffle_meta)
    out_data = alltoallv_tup(data, data_shuffle_meta, shuffle_meta)

    return shuffle_meta.out_arr, out_data